From 9f48e5793cba2943bfd0ea76856eae551084e68b Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Wed, 31 May 2017 16:06:42 +0200
Subject: [PATCH 01/29] aahh

---
 plato/tools/common/online_predictors.py |  7 +++++++
 plato/tools/convnet/conv_specifiers.py  | 11 +++++++++++
 plato/tools/convnet/convnet.py          | 22 +++++++++++++++++-----
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/plato/tools/common/online_predictors.py b/plato/tools/common/online_predictors.py
index 4bc6281..389b6d8 100644
--- a/plato/tools/common/online_predictors.py
+++ b/plato/tools/common/online_predictors.py
@@ -80,6 +80,13 @@ def predict(self, inputs):
     def train(self, inputs, labels):
         feedforward_module = self._function if isinstance(self._function, FeedForwardModule) else ParametrizedFeedForwardModule(self._function)
         feedforward_module.train(x=inputs, y=labels, optimizer=self._optimizer, assert_all_params_optimized = self.assert_all_params_optimized, cost_fcn=self._cost_function, regularization_cost=self._regularization_cost)
+        # if isinstance(self._function, FeedForwardModule):  # A bit ugly but oh well
+        # else:
+        #     outputs = self._function.train_call(inputs) if isinstance(self._function, FeedForwardModule) else self._function(inputs)
+        #     cost = self._cost_function(outputs, labels)
+        #     if self._regularization_cost is not None:
+        #         cost += self._regularization_cost(self._function.parameters)
+        #     self._optimizer.update_parameters(cost = cost, parameters = self._function.parameters)
 
     @property
     def parameters(self):
diff --git a/plato/tools/convnet/conv_specifiers.py b/plato/tools/convnet/conv_specifiers.py
index 117741b..ecba4b3 100644
--- a/plato/tools/convnet/conv_specifiers.py
+++ b/plato/tools/convnet/conv_specifiers.py
@@ -113,4 +113,15 @@ def shape_transfer(self, input_shape):
             return n_samples, self.w.shape[1]
 
 
+class ConvNetSpec(PrimativeSpecifier):
+
+    def __init__(self, layer_ordered_dict):
+        self.layer_ordered_dict = layer_ordered_dict
+
+    def shape_transfer(self):
+        raise NotImplementedError()
+
+
+
+
 # class ConvNetSpec
\ No newline at end of file
diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index 3877aae..4ccd9fb 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -8,7 +8,7 @@
 from plato.interfaces.interfaces import IParameterized
 from plato.tools.common.online_predictors import FeedForwardModule
 from plato.tools.convnet.conv_specifiers import ConvInitSpec, ConvolverSpec, PoolerSpec, NonlinearitySpec, DropoutSpec, \
-    FullyConnectedSpec
+    FullyConnectedSpec, ConvNetSpec
 from theano.tensor.signal.pool import pool_2d
 __author__ = 'peter'
 import logging
@@ -125,6 +125,9 @@ def test_call(self, x):
     def to_spec(self):
         return DropoutSpec(self.dropout_rate)
 
+    @classmethod
+    def from_spec(cls, spec):
+        return cls(spec.dropout_rate, rng=None)  # spec holds only the rate (see to_spec); `rng` was never bound in this method, so pass None explicitly
 
 @symbolic
 class FullyConnectedLayer(FeedForwardModule):
@@ -150,6 +153,10 @@ def parameters(self):
     def to_spec(self):
         return FullyConnectedSpec(w=self.w.get_value(), b=self.b.get_value() if self.b is not False else False)
 
+    @classmethod
+    def from_spec(cls, spec):
+        return FullyConnectedLayer(w=spec.w, b=spec.b)
+
 
 @symbolic
 class ConvNet(IParameterized):
@@ -191,9 +198,9 @@ def get_named_layer_activations(self, x, include_input = False, test_call=False)
         for name, layer in self.layers.iteritems():
             x = layer.test_call(x) if test_call else layer.train_call(x)
             named_activations[name] = x
-            if isinstance(layer, ConvLayer):
-                tdbprint(abs(layer.w).mean(), 'Mean Magnitude of w of layer {}'.format(name))
-                tdbprint(abs(layer.b).mean(), 'Mean Magnitude of b of layer {}'.format(name))
+            # if isinstance(layer, ConvLayer):
+            #     tdbprint(abs(layer.w).mean(), 'Mean Magnitude of w of layer {}'.format(name))
+            #     tdbprint(abs(layer.b).mean(), 'Mean Magnitude of b of layer {}'.format(name))
         return named_activations
 
     @staticmethod
@@ -239,10 +246,15 @@ def parameters(self):
         return sum([l.parameters if isinstance(l, IParameterized) else [] for l in self.layers.values()], [])
 
     def to_spec(self):
-        return OrderedDict((layer_name, lay.to_spec()) for layer_name, lay in self.layers.iteritems())
+        return ConvNetSpec(OrderedDict((layer_name, lay.to_spec()) for layer_name, lay in self.layers.iteritems()))
+
+    @classmethod
+    def from_spec(cls, spec):
+        return ConvNet.from_init(spec)
 
 
 def specifier_to_layer(spec, force_shared_parameters=True, rng = None):
+    # TODO: Remove, replace with from_spec
     return {
         ConvolverSpec: lambda: ConvLayer(
             w=spec.w,

From c3fa9d3eedaca69e7b209d61a491befe938a4693 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Wed, 31 May 2017 16:46:32 +0200
Subject: [PATCH 02/29] temporary fix

---
 plato/tools/convnet/convnet.py | 54 ++++++++++++++++++++++++++--------
 1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index 4ccd9fb..f6f0ca6 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -48,6 +48,15 @@ def parameters(self):
     def to_spec(self):
         return ConvolverSpec(self.w.get_value(), self.b.get_value() if self.b is not False else False, self.border_mode)
 
+    @classmethod
+    def from_spec(cls, spec):
+        return ConvLayer(
+            w=spec.w,
+            b=spec.b,
+            border_mode= {'full': 0, 'same': 1, 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode,
+            filter_flip=False
+            )
+
 
 @symbolic
 class Nonlinearity(FeedForwardModule):
@@ -66,6 +75,10 @@ def to_spec(self):
         assert isinstance(self._activation_name, basestring), "Can't identify activation fcn"
         return NonlinearitySpec(self._activation_name)
 
+    @classmethod
+    def from_spec(cls, spec):
+        return Nonlinearity(spec.func)
+
 
 @symbolic
 class Pooler(FeedForwardModule):
@@ -97,6 +110,9 @@ def __call__(self, x):
     def to_spec(self):
         return PoolerSpec(region = self.region, stride=self.stride, mode=self.mode)
 
+    @classmethod
+    def from_spec(cls, spec):
+        return Pooler(region=spec.region, stride=spec.stride, mode=spec.mode)
 
 @symbolic
 class DropoutLayer(FeedForwardModule):
@@ -204,7 +220,7 @@ def get_named_layer_activations(self, x, include_input = False, test_call=False)
         return named_activations
 
     @staticmethod
-    def from_init(specifiers, input_shape, w_init=0.01, force_shared_parameters = True, rng=None):
+    def from_init(specifiers, input_shape=None, w_init=0.01, force_shared_parameters = True, rng=None):
         """
         Convenient initialization function.
         :param specifiers: List/OrderedDict of layer speciefier objects (see conv_specifiers.py)
@@ -215,12 +231,13 @@ def from_init(specifiers, input_shape, w_init=0.01, force_shared_parameters = Tr
         :return: A ConvNet
         """
         rng = get_rng(rng)
-        n_maps, n_rows, n_cols = input_shape
+        n_maps, n_rows, n_cols = input_shape if input_shape is not None else (None, None, None)
         layers = OrderedDict()
         if isinstance(specifiers, (list, tuple)):
             specifiers = OrderedDict((str(i), val) for i, val in enumerate(specifiers))
         for spec_name, spec in specifiers.iteritems():
             if isinstance(spec, ConvInitSpec):
+                assert n_maps is not None
                 spec = ConvolverSpec(
                     w=w_init*rng.randn(spec.n_maps, n_maps, spec.filter_size[0], spec.filter_size[1]),
                     b=np.zeros(spec.n_maps) if spec.use_bias else False,
@@ -228,15 +245,15 @@ def from_init(specifiers, input_shape, w_init=0.01, force_shared_parameters = Tr
                     )
             if isinstance(spec, ConvolverSpec):
                 n_maps = spec.w.shape[0]
-                if spec.mode == 'valid':
-                    n_rows += -spec.w.shape[2] + 1
-                    n_cols += -spec.w.shape[3] + 1
-                elif isinstance(spec.mode, int):
-                    n_rows += -spec.w.shape[2] + 1 + spec.mode*2
-                    n_cols += -spec.w.shape[3] + 1 + spec.mode*2
-            elif isinstance(spec, PoolerSpec):
-                n_rows /= spec.region[0]
-                n_cols /= spec.region[1]
+            #     if spec.mode == 'valid':
+            #         n_rows += -spec.w.shape[2] + 1
+            #         n_cols += -spec.w.shape[3] + 1
+            #     elif isinstance(spec.mode, int):
+            #         n_rows += -spec.w.shape[2] + 1 + spec.mode*2
+            #         n_cols += -spec.w.shape[3] + 1 + spec.mode*2
+            # elif isinstance(spec, PoolerSpec):
+            #     n_rows /= spec.region[0]
+            #     n_cols /= spec.region[1]
             layers[spec_name] = specifier_to_layer(spec, force_shared_parameters=force_shared_parameters, rng=rng)
             # LOGGER.info('Layer "%s" (%s) output shape: %s' % (spec_name, spec.__class__.__name__, (n_maps, n_rows, n_cols)))
         return ConvNet(layers)
@@ -250,7 +267,7 @@ def to_spec(self):
 
     @classmethod
     def from_spec(cls, spec):
-        return ConvNet.from_init(spec)
+        return ConvNet.from_init(spec.layer_ordered_dict)
 
 
 def specifier_to_layer(spec, force_shared_parameters=True, rng = None):
@@ -287,3 +304,16 @@ def normalize_convnet(convnet, inputs):
             cum_scale = this_std / cum_scale
             convnet.layers[name].w.set_value(convnet.layers[name].w.get_value()/cum_scale)
             convnet.layers[name].b.set_value(convnet.layers[name].b.get_value()/this_std)
+
+
+def spec_to_object(spec):  # Temporary measure... will do this more cleanly later.
+    """Build the layer/network whose class is registered for type(spec), via that class's from_spec."""
+    cls = {
+        ConvNetSpec: ConvNet,
+        NonlinearitySpec: Nonlinearity,
+        PoolerSpec: Pooler,
+        FullyConnectedSpec: FullyConnectedLayer,
+        ConvolverSpec: ConvLayer,
+        DropoutSpec: DropoutLayer,  # keyed by the spec class (was DropoutLayer, so DropoutSpec lookups raised KeyError)
+        }[spec.__class__]
+    return cls.from_spec(spec)

From 77845945e33f39a1d9928e56dd4ff354851024c2 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Fri, 16 Jun 2017 16:12:06 +0200
Subject: [PATCH 03/29] correction

---
 plato/tools/lstm/long_short_term_memory.py | 34 ++++++++++++++++------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/plato/tools/lstm/long_short_term_memory.py b/plato/tools/lstm/long_short_term_memory.py
index dfe64d7..7a2bd38 100644
--- a/plato/tools/lstm/long_short_term_memory.py
+++ b/plato/tools/lstm/long_short_term_memory.py
@@ -1,6 +1,8 @@
-from plato.core import add_update, symbolic_multi, symbolic_simple, create_shared_variable
+from artemis.general.numpy_helpers import get_rng
+from artemis.ml.tools.neuralnets import initialize_weight_matrix
+from plato.core import add_update, symbolic_multi, symbolic_simple
 from plato.interfaces.decorators import symbolic_updater
-from plato.interfaces.helpers import create_shared_variable, get_theano_rng, get_named_activation_function, softmax
+from plato.interfaces.helpers import create_shared_variable, get_theano_rng, get_named_activation_function
 from plato.tools.optimization.cost import mean_xe
 from plato.tools.optimization.optimizers import AdaMax
 import theano
@@ -98,7 +100,7 @@ def parameters(self):
             self.w_ho, self.w_co, self.b_i, self.b_f, self.b_c, self.b_o]
     
     @classmethod
-    def from_initializer(cls, n_input, n_hidden, initializer_fcn, hidden_layer_type='tanh'):
+    def from_initializer(cls, n_input, n_hidden, initializer_fcn = 'xavier', hidden_layer_type='tanh', rng=None):
         """
         :param n_input: Number of inputs
         :param n_hidden: Number of hiddens
@@ -106,6 +108,11 @@ def from_initializer(cls, n_input, n_hidden, initializer_fcn, hidden_layer_type=
         :param initializer_fcn: Function taking a shape and returning parameters.
         :return: An LSTMLayer
         """
+        if isinstance(initializer_fcn, basestring):
+            rng = get_rng(rng)
+            initializer = initializer_fcn
+            initializer_fcn = lambda (n_in, n_out): initialize_weight_matrix(n_in=n_in, n_out=n_out, mag=initializer, rng=rng)
+
         return LSTMLayer(
             w_xi = create_shared_variable(initializer_fcn, shape = (n_input, n_hidden)),
             w_xf = create_shared_variable(initializer_fcn, shape = (n_input, n_hidden)),
@@ -128,10 +135,15 @@ class AutoencodingLSTM(object):
     """
     An LSTM that learns to predict the next element in a sequence.
     """
-    def __init__(self, n_input, n_hidden, initializer_fcn, input_layer_type = 'softmax', hidden_layer_type = 'tanh'):
+    def __init__(self, n_input, n_hidden, initializer_fcn='xavier', input_layer_type = 'softmax', hidden_layer_type = 'tanh', rng=None):
+
+        if isinstance(initializer_fcn, basestring):
+            rng = get_rng(rng)
+            initializer = initializer_fcn
+            initializer_fcn = lambda (n_in, n_out): initialize_weight_matrix(n_in=n_in, n_out=n_out, mag=initializer, rng=rng)
 
         self.lstm = LSTMLayer.from_initializer(n_input=n_input, n_hidden=n_hidden, initializer_fcn=initializer_fcn,
-            hidden_layer_type = hidden_layer_type)
+            hidden_layer_type = hidden_layer_type, rng=rng)
         self.w_hz = create_shared_variable(initializer_fcn, (n_hidden, n_input))
         self.b_z = create_shared_variable(0, n_input)
 
@@ -159,7 +171,7 @@ def get_generation_function(self, maintain_state = True, stochastic = True, rng
         """
         Return a symbolic function that generates a sequence (and updates its internal state).
         :param stochastic: True to sample a onehot-vector from the output.  False to simply reinsert the
-            distribution vector.
+            distribution vector (only makes sense on categorical variables, not regression).
         :param rng: A seed, numpy or theano random number generator
         :return: A symbolic function of the form:
             (outputs, updates) = generate(primer, n_steps)
@@ -190,7 +202,11 @@ def do_step(i, x_, h_, c_):
                 c_: A memory cell vector
                 """
                 y_prob, h, c = self.step(x_, h_, c_)
-                y_candidate = ifelse(int(stochastic), rng.multinomial(n=1, pvals=y_prob[None, :])[0].astype(theano.config.floatX), y_prob)
+                if stochastic:
+                    y_candidate = rng.multinomial(n=1, pvals=y_prob[None, :])[0].astype(theano.config.floatX)
+                else:
+                    y_candidate = y_prob
+                # y_candidate = ifelse(int(stochastic), rng.multinomial(n=1, pvals=y_prob[None, :])[0].astype(theano.config.floatX), y_prob)
                 # y_candidate = ifelse(int(stochastic), rng.multinomial(n=1, pvals=y_prob.dimshuffle('x', 1))[0].astype(theano.config.floatX), y_prob)
                 y = ifelse(i < n_primer_steps, primer[i], y_candidate)  # Note: If you get error here, you just need to prime with something on first call.
                 return y, h, c
@@ -235,8 +251,8 @@ def parameters(self):
 def mysoftmax(x):
     # A little kludge we have to do because the build-in softmax is awkwardly restricted to being along the first
     # axis.
-    if x.indim==1:
+    if x.ndim==1:
         newx = x.dimshuffle('x', 0)
         return tt.nnet.softmax(newx)[0]
-    elif x.indim==2:
+    elif x.ndim==2:
         return tt.nnet.softmax(x)

From 4e3b8cb15ced8ca8f05bb192a35a013b78237514 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Fri, 30 Jun 2017 12:09:25 +0200
Subject: [PATCH 04/29] added feedback alignment net

---
 plato/tools/fa/__init__.py                | 0
 plato/tools/fa/demo_feedback_alignment.py | 2 ++
 2 files changed, 2 insertions(+)
 create mode 100644 plato/tools/fa/__init__.py
 create mode 100644 plato/tools/fa/demo_feedback_alignment.py

diff --git a/plato/tools/fa/__init__.py b/plato/tools/fa/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/plato/tools/fa/demo_feedback_alignment.py b/plato/tools/fa/demo_feedback_alignment.py
new file mode 100644
index 0000000..139597f
--- /dev/null
+++ b/plato/tools/fa/demo_feedback_alignment.py
@@ -0,0 +1,2 @@
+
+

From e4a14813416834ed9daf979193788901cdc0509d Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Sat, 1 Jul 2017 17:53:59 +0100
Subject: [PATCH 05/29] eeh

---
 plato/tools/mlp/manual_backprop_net.py      | 152 ++++++++++++++++++++
 plato/tools/mlp/test_manual_backprop_net.py |  55 +++++++
 2 files changed, 207 insertions(+)
 create mode 100644 plato/tools/mlp/manual_backprop_net.py
 create mode 100644 plato/tools/mlp/test_manual_backprop_net.py

diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
new file mode 100644
index 0000000..58aa72c
--- /dev/null
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -0,0 +1,152 @@
+from abc import abstractmethod
+
+import numpy as np
+from artemis.general.should_be_builtins import izip_equal
+from plato.core import create_constant, symbolic
+from plato.interfaces.helpers import batchify_function, get_named_activation_function
+from plato.interfaces.interfaces import IParameterized
+from plato.tools.common.online_predictors import ISymbolicPredictor
+from plato.tools.mlp.mlp import FullyConnectedTransform
+from plato.tools.optimization.cost import get_named_cost_function
+from plato.tools.optimization.optimizers import IGradientOptimizer
+from theano import tensor as tt
+
+
+class ManualBackpropNet(ISymbolicPredictor):
+
+    def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass_loss = True):
+        """
+        :param layers: Sequence of layers providing forward_pass, forward_pass_and_state, backward_pass and parameters
+        :param optimizer: An IGradientOptimizer for all parameters, or a list/tuple of optimizers (zipped against the layers in reverse order)
+        :param loss: A cost function called as loss(output, y), or a name to look up with get_named_cost_function
+        """
+        self.layers = layers
+        self.optimizer = optimizer
+        self.pass_loss = pass_loss  # True: last layer's backward_pass gets the loss as `cost`; False: it gets d(loss)/d(output) as `grad`
+        self.loss = get_named_cost_function(loss) if isinstance(loss, basestring) else loss
+        self.prediction_minibatch_size = prediction_minibatch_size  # None: predict in a single pass; else batch size given to batchify_function
+
+    @symbolic
+    def predict(self, x):
+        if self.prediction_minibatch_size is None:
+            return self._predict_in_single_pass(x)
+        else:
+            return batchify_function(self._predict_in_single_pass, batch_size=self.prediction_minibatch_size)(x)
+
+    def _predict_in_single_pass(self, x):
+        for i, layer in enumerate(self.layers):
+            x = layer.forward_pass(x)
+        return x
+
+    @symbolic
+    def _predict_minibatch(self, start, end, x):
+        return self.predict(x[start:end], _single_pass=True)
+
+    @symbolic
+    def train(self, x, y):
+        states = {}
+        for layer in self.layers:
+            x, layer_state = layer.forward_pass_and_state(x)
+            states[layer]=layer_state
+        layerwise_param_grad_pairs = []
+        loss = self.loss(x, y)
+        if self.pass_loss:
+            grad = None
+        else:
+            grad = tt.grad(loss, wrt=x)
+            loss = None
+        for layer in self.layers[::-1]:
+            grad, param_grads = layer.backward_pass(state=states[layer], grad=grad, cost = loss)
+            loss = None
+            layerwise_param_grad_pairs.append(list(izip_equal(layer.parameters, param_grads)))
+        if isinstance(self.optimizer, IGradientOptimizer):
+            all_params, all_param_grads = zip(*[(p, g) for layer_pairs in layerwise_param_grad_pairs for p, g in layer_pairs])
+            self.optimizer.update_from_gradients(parameters=all_params, gradients=all_param_grads)
+        elif isinstance(self.optimizer, (list, tuple)):
+            for optimizer, layer_pairs in izip_equal(self.optimizer, layerwise_param_grad_pairs):
+                params, grads = zip(*layer_pairs)
+                optimizer.update_from_gradients(parameters=params, gradients=grads)
+        return create_constant(0.)  # scan demands some return
+
+    @property
+    def parameters(self):
+        return [p for layer in self.layers for p in layer.parameters]
+
+
+class IManualBackpropLayer(IParameterized):
+
+    def forward_pass(self, x):
+        """
+        :param x: A real (n_samples, n_dims_in) input
+        :return: A real (n_samples, n_dims_in) output
+        """
+        out, _ = self.forward_pass_and_state(x)
+        return out
+
+    @abstractmethod
+    def forward_pass_and_state(self, x):
+        """
+        :param x:
+        :return: out, state
+            Where:
+                out is the output of the layer
+                state is a list of state-variables to be passed into the backward pass.
+                Importantly, they must be in order (so that the last element of state is the one used to compute the gradient)
+        """
+
+
+    # def backward_pass_from_loss(self, state, loss):
+    #     grad = tt.grad(loss, wrt=state[-1])
+    #     return self.backward_pass_from_grad(state, grad)
+
+    @abstractmethod
+    def backward_pass(self, state, grad, cost):
+        """
+        :param state: The list of state variables you returned in forward_pass_and_state
+        :param grad: The incoming gradient (ManualBackpropNet passes None here when it passes `cost` instead)
+        :return: (outgoing gradient, parameter gradients ordered like self.parameters)
+        """
+
+
+class ExactBackpropLayer(IManualBackpropLayer):
+    """
+    Performs the function of a layer.
+    """
+
+    def __init__(self, linear_transform, nonlinearity):
+        """
+        linear_transform: Can be:
+            A callable (e.g. FullyConnectedBridge/ConvolutionalBridge) which does a linear transform on the data.
+            A numpy array - in which case it will be used to instantiate a linear transform.
+        """
+        if isinstance(linear_transform, np.ndarray):
+            assert (linear_transform.ndim == 2 and nonlinearity!='maxout') or (linear_transform.ndim == 3 and nonlinearity=='maxout'), \
+                'Your weight matrix must be 2-D (or 3-D if you have maxout units)'
+            linear_transform = FullyConnectedTransform(w=linear_transform)
+        if isinstance(nonlinearity, str):
+            nonlinearity = get_named_activation_function(nonlinearity)
+        self.linear_transform = linear_transform
+        self.nonlinearity = nonlinearity
+
+    def forward_pass_and_state(self, x):
+        pre_sig = self.linear_transform(x)
+        return self.nonlinearity(pre_sig), (x, pre_sig, )
+
+    def backward_pass(self, state, grad, cost):
+        x, _ = state
+        if cost is None:
+            y, (x, pre_sig) = self.forward_pass_and_state(x)
+            dydp = tt.grad(y.sum(), wrt=pre_sig)
+            # Note... we rely on the (linear-transform, pointwise-nonlinearity) design here.  We should figure out how
+            # to do it more generally (maybe using tt.jacobian), or somehow making a virtual cost.
+            dcdp = grad*dydp
+            dcdw = x.T.dot(dcdp)  # Because I think if we did this directly for the ws we'd be in trouble
+            dcdb = dcdp.sum(axis=0)
+            dcdx = dcdp.dot(self.linear_transform.w.T)
+            return dcdx, [dcdw, dcdb]
+        else:
+            return tt.grad(cost, wrt=x), tt.grad(cost, wrt=self.linear_transform.parameters)
+
+    @property
+    def parameters(self):
+        return self.linear_transform.parameters
diff --git a/plato/tools/mlp/test_manual_backprop_net.py b/plato/tools/mlp/test_manual_backprop_net.py
new file mode 100644
index 0000000..d606a3d
--- /dev/null
+++ b/plato/tools/mlp/test_manual_backprop_net.py
@@ -0,0 +1,55 @@
+import numpy as np
+from artemis.ml.tools.neuralnets import initialize_network_params
+from plato.tools.common.online_predictors import GradientBasedPredictor
+from plato.tools.mlp.manual_backprop_net import ManualBackpropNet, ExactBackpropLayer
+from plato.tools.mlp.mlp import MultiLayerPerceptron
+from plato.tools.optimization.optimizers import GradientDescent
+
+
+def test_exact_manual_backprop_net():
+
+    rng = np.random.RandomState(1234)
+
+    n_samples = 5
+    n_in, n_hid1, n_hid2, n_out = 10, 8, 7, 6
+    ws = initialize_network_params(layer_sizes=[n_in, n_hid1, n_hid2, n_out], include_biases=False)
+    x, y = rng.randn(n_samples, n_in), rng.randn(n_samples, n_out)
+
+    auto_mlp = GradientBasedPredictor(
+        function = MultiLayerPerceptron.from_weights(weights=ws, hidden_activations='relu', output_activation='linear'),
+        cost_function='softmax-xe',
+        optimizer=GradientDescent(0.1)
+        )
+    stick_mlp = ManualBackpropNet(
+        layers = [ExactBackpropLayer(ws[0], 'relu'), ExactBackpropLayer(ws[1], 'relu'), ExactBackpropLayer(ws[2], 'linear')],
+        optimizer = GradientDescent(0.1),
+        loss = 'softmax-xe'
+        )
+
+    # Check forward passes match
+    fp_auto = auto_mlp.predict.compile()
+    fp_stick = stick_mlp.predict.compile()
+    out_auto = fp_auto(x)
+    out_stick = fp_stick(x)
+    assert np.allclose(out_auto, out_stick)
+
+    # 1 Iteration of training
+    ft_auto = auto_mlp.train.compile()
+    ft_stick = stick_mlp.train.compile()
+    ft_auto(x, y)
+    ft_stick(x, y)
+
+    # Check parameter changes match
+    dw0_auto = auto_mlp._function.layers[0].linear_transform.w.get_value() - ws[0]
+    dw0_stick = stick_mlp.layers[0].linear_transform.w.get_value() - ws[0]
+    assert np.allclose(dw0_auto, dw0_stick)
+
+    # Check outputs match
+    new_out_auto = fp_auto(x)
+    new_out_stick = fp_stick(x)
+    assert np.allclose(new_out_auto, new_out_stick)
+    assert not np.allclose(new_out_stick, out_auto)
+
+
+if __name__ == '__main__':
+    test_exact_manual_backprop_net()

From 7ce83633805a50473f33c0a17dc894adf670eff7 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Fri, 28 Jul 2017 18:24:37 +0200
Subject: [PATCH 06/29] dfdsfd

---
 plato/test_core.py                          |  4 +-
 plato/tools/common/config.py                | 15 +++-
 plato/tools/convnet/convnet.py              |  5 +-
 plato/tools/fa/demo_feedback_alignment.py   | 88 +++++++++++++++++++++
 plato/tools/fa/direct_feedback_alignment.py | 80 +++++++++++++++++++
 plato/tools/fa/feedback_alignment.py        | 71 +++++++++++++++++
 plato/tools/misc/tdb_plotting.py            |  3 +-
 plato/tools/mlp/demo_mnist_mlp.py           | 19 +++--
 plato/tools/mlp/manual_backprop_net.py      |  4 -
 9 files changed, 273 insertions(+), 16 deletions(-)
 create mode 100644 plato/tools/fa/direct_feedback_alignment.py
 create mode 100644 plato/tools/fa/feedback_alignment.py

diff --git a/plato/test_core.py b/plato/test_core.py
index ae9f7c6..f140be9 100644
--- a/plato/test_core.py
+++ b/plato/test_core.py
@@ -2,7 +2,7 @@
 
 from artemis.general.hashing import compute_fixed_hash, fixed_hash_eq
 from plato.interfaces.helpers import create_shared_variable
-from plato.tools.common.config import float_precision
+from plato.tools.common.config import hold_float_precision
 from pytest import raises
 from plato.core import symbolic_simple, symbolic_updater, SymbolicFormatError, \
     tdb_trace, get_tdb_traces, symbolic, set_enable_omniscence, EnableOmniscence, clear_tdb_traces, add_update, \
@@ -516,7 +516,7 @@ def do_some_ops(x):
 
 def test_arbitrary_structures():
 
-    with float_precision(64):
+    with hold_float_precision(64):
         @symbolic
         def my_func(inp):
             """
diff --git a/plato/tools/common/config.py b/plato/tools/common/config.py
index d55c7ef..160ec7d 100644
--- a/plato/tools/common/config.py
+++ b/plato/tools/common/config.py
@@ -5,7 +5,7 @@
 
 
 @contextmanager
-def float_precision(value):
+def hold_float_precision(value):
     """
     Change the theano float precesion variable (theano.config.floatX) for all code in a context.  Temporarily overrides
     the value defined in .theanorc.
@@ -25,3 +25,16 @@ def float_precision(value):
     theano.config.floatX = value
     yield
     theano.config.floatX = old_precision
+
+
+float_precision = hold_float_precision # Back-compatibility
+
+
+@contextmanager
+def hold_theano_optimizer(value):
+    """Temporarily set theano.config.optimizer (None is passed as 'None'); restored on exit, even on error."""
+    old_val, theano.config.optimizer = theano.config.optimizer, ('None' if value is None else value)
+    try:
+        yield
+    finally:
+        theano.config.optimizer = old_val
diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index 030e9e6..7537aa2 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -254,7 +254,10 @@ def to_spec(self):
 
     @classmethod
     def from_spec(cls, spec):
-        return ConvNet.from_init(spec.layer_ordered_dict)
+        if isinstance(spec, OrderedDict): # "old" format
+            return ConvNet.from_init(spec)
+        else:
+            return ConvNet.from_init(spec.layer_ordered_dict)
 
 
 def specifier_to_layer(spec, force_shared_parameters=True, rng = None):
diff --git a/plato/tools/fa/demo_feedback_alignment.py b/plato/tools/fa/demo_feedback_alignment.py
index 139597f..f39d3f2 100644
--- a/plato/tools/fa/demo_feedback_alignment.py
+++ b/plato/tools/fa/demo_feedback_alignment.py
@@ -1,2 +1,90 @@
+from artemis.experiments.experiment_record import experiment_root, capture_created_experiments, ExperimentFunction
+from artemis.experiments.ui import browse_experiments
+from artemis.general.numpy_helpers import get_rng
+from artemis.ml.datasets.mnist import get_mnist_dataset
+from artemis.ml.predictors.train_and_test import train_and_test_online_predictor
+from plato.tools.common.online_predictors import GradientBasedPredictor
+from plato.tools.fa.direct_feedback_alignment import create_direct_feedback_alignment_net
+from plato.tools.fa.feedback_alignment import create_feedback_alignment_net
+from plato.tools.mlp.mlp import MultiLayerPerceptron
+from plato.tools.optimization.optimizers import GradientDescent
 
 
+def create_network(version, layer_sizes, optimizer, nonlinearity, final_nonlinearity, backwards_nonlinearity, loss,
+                   w_init = 'xavier', rng=None):
+
+    if version == 'fa':
+        return create_feedback_alignment_net(layer_sizes=layer_sizes, optimizer=optimizer, backwards_nonlinearity=backwards_nonlinearity,
+            nonlinearity=nonlinearity, final_nonlinearity=final_nonlinearity, loss=loss, w_init=w_init, rng=rng)
+    elif version == 'dfa':
+        return create_direct_feedback_alignment_net(layer_sizes=layer_sizes, optimizer=optimizer, backwards_nonlinearity=backwards_nonlinearity,
+            nonlinearity=nonlinearity, final_nonlinearity=final_nonlinearity, loss=loss, w_init=w_init,  rng=rng)
+    elif version == 'mlp':
+        return GradientBasedPredictor(
+            function=MultiLayerPerceptron.from_init(
+                layer_sizes=layer_sizes,
+                hidden_activations = nonlinearity,
+                output_activation = final_nonlinearity,
+                w_init=w_init,
+                rng=rng,
+                ),
+
+            optimizer=optimizer,
+            cost_function=loss,
+            )
+    else:
+        raise Exception('No network version "{}"'.format(version))
+
+
+@ExperimentFunction(is_root=True, one_liner_results=lambda scores: scores.get_oneliner())
+def demo_feedback_alignment_mnist(
+            version = 'fa',
+            hidden_sizes = [100],
+            nonlinearity = 'relu',
+            final_nonlinearity = 'linear',
+            loss = 'logistic-xe',
+            backwards_nonlinearity = 'deriv',
+            n_epochs=10,
+            minibatch_size = 10,
+            learning_rate = 0.01,
+            seed = 1234,
+            ):
+
+    assert version in ('fa', 'dfa', 'mlp')
+    rng = get_rng(seed)
+    mnist = get_mnist_dataset(flat=True).to_onehot()
+
+    nnet = create_network(
+        version=version,
+        layer_sizes=[mnist.input_size]+hidden_sizes+[10],
+        optimizer=GradientDescent(learning_rate),
+        backwards_nonlinearity=backwards_nonlinearity, nonlinearity=nonlinearity, final_nonlinearity=final_nonlinearity, loss=loss, rng=rng
+        )
+
+    training_info = train_and_test_online_predictor(
+        dataset = mnist,
+        train_fcn=nnet.train.compile(add_test_values = True),
+        predict_fcn=nnet.predict.compile(add_test_values = True),
+        minibatch_size=minibatch_size,
+        n_epochs=n_epochs,
+        test_epochs=('every', 0.5),
+        )
+
+    return training_info
+
+
+with capture_created_experiments() as exs:
+    demo_feedback_alignment_mnist.add_variant(version='fa')
+    demo_feedback_alignment_mnist.add_variant(version='dfa')
+    demo_feedback_alignment_mnist.add_variant(version='mlp')
+
+for e in exs:
+    e.add_variant(hidden_sizes=[200, 200, 200], n_epochs = 50)
+for e in exs:
+    e.add_variant(hidden_sizes=[500, 500, 500, 500, 500], n_epochs = 50)
+
+
+if __name__ == '__main__':
+    browse_experiments()
+    # demo_feedback_alignment_mnist(version = 'dfa', hidden_sizes=[200, 200, 200])
+
diff --git a/plato/tools/fa/direct_feedback_alignment.py b/plato/tools/fa/direct_feedback_alignment.py
new file mode 100644
index 0000000..fbdfbd3
--- /dev/null
+++ b/plato/tools/fa/direct_feedback_alignment.py
@@ -0,0 +1,80 @@
+import numpy as np
+from artemis.general.numpy_helpers import get_rng
+from artemis.general.should_be_builtins import izip_equal
+from artemis.ml.tools.neuralnets import initialize_weight_matrix
+from plato.core import create_shared_variable
+from plato.interfaces.helpers import get_named_activation_function, get_named_activation_function_derivative
+from plato.tools.mlp.manual_backprop_net import IManualBackpropLayer, ManualBackpropNet
+import theano.tensor as tt
+
+class DirectFeedbackAlignmentLayer(IManualBackpropLayer):
+    """
+    Dense layer for direct feedback alignment.  When `cost` is given (the "just top layer" case, which requires
+    w_back to be None) the error signal is tt.grad(cost, pre_sig); otherwise it is the incoming grad projected through
+    w_back.  Either way the un-projected grad is returned as the first output.  w_back is not listed in `parameters`.
+    """
+
+    def __init__(self, w, w_back, nonlinearity, b=None, backwards_nonlinearity = 'deriv'):
+        self.n_in, self.n_out = w.shape
+
+        self.w = create_shared_variable(w)
+        self.b = create_shared_variable(np.zeros(w.shape[1]) if b is None else b)
+        if w_back is None:
+            self.w_back = None
+        else:
+            assert w_back.shape[1] == self.n_out
+            self.w_back = create_shared_variable(w_back)
+
+        self.nonlinearity = get_named_activation_function(nonlinearity) if isinstance(nonlinearity, str) else nonlinearity
+        self.backwards_nonlinearity = \
+            get_named_activation_function_derivative(nonlinearity) if backwards_nonlinearity=='deriv' else \
+            get_named_activation_function(backwards_nonlinearity) if isinstance(backwards_nonlinearity, basestring) else \
+            backwards_nonlinearity
+
+    @property
+    def parameters(self):
+        return [self.w, self.b]
+
+    def forward_pass_and_state(self, x):
+        pre_sig = x.dot(self.w)  # NOTE(review): self.b is never added here, although it is a parameter and backward_pass returns a gradient for it - confirm intent
+        return self.nonlinearity(pre_sig), (x, pre_sig, )
+
+    def backward_pass(self, state, grad, cost):
+        # assert cost is None, 'You need to initialize the outer network with pass_loss = False'
+        x, pre_sig = state
+        if cost is not None:  # Just top layer
+            assert self.w_back is None
+            grad = this_grad = tt.grad(cost, wrt=pre_sig)
+        else:  # Other layers
+            this_grad = grad.dot(self.w_back)
+        grad_presig = this_grad * self.backwards_nonlinearity(pre_sig)  # NOTE(review): in the cost branch this_grad is already d(cost)/d(pre_sig) and is still multiplied here - confirm intent
+        return grad, [x.T.dot(grad_presig), grad_presig.mean(axis=0)]
+
+    @classmethod
+    def from_init(cls, n_in, n_out, n_final, w_init='xavier', rng=None, **kwargs):
+        rng = get_rng(rng)
+        w = initialize_weight_matrix(n_in=n_in, n_out=n_out, mag=w_init, rng=rng)
+        w_back = None if n_final is None else initialize_weight_matrix(n_in=n_final, n_out=n_out, mag=w_init, rng=rng)
+        return cls(w=w, w_back=w_back, **kwargs)
+
+
+def create_direct_feedback_alignment_net(layer_sizes, nonlinearity, final_nonlinearity, optimizer, loss,
+        backwards_nonlinearity='deriv', w_init = 'xavier', rng = None):
+    """Build a ManualBackpropNet (pass_loss=True) with one DirectFeedbackAlignmentLayer per consecutive pair in layer_sizes; all but the last layer get n_final=layer_sizes[-1] and `nonlinearity`, the last gets n_final=None and `final_nonlinearity`."""
+    rng = get_rng(rng)
+    return ManualBackpropNet(
+        layers = [
+            DirectFeedbackAlignmentLayer.from_init(
+                n_in=n_in,
+                n_out=n_out,
+                n_final=layer_sizes[-1] if i < len(layer_sizes)-2 else None,
+                nonlinearity=nonlinearity if i < len(layer_sizes)-2 else final_nonlinearity,
+                backwards_nonlinearity = backwards_nonlinearity,
+                w_init = w_init,
+                rng=rng
+                ) for i, (n_in, n_out) in enumerate(izip_equal(layer_sizes[:-1], layer_sizes[1:]))
+            ],
+        optimizer=optimizer,
+        pass_loss=True,
+        loss=loss
+        )
diff --git a/plato/tools/fa/feedback_alignment.py b/plato/tools/fa/feedback_alignment.py
new file mode 100644
index 0000000..1d171f0
--- /dev/null
+++ b/plato/tools/fa/feedback_alignment.py
@@ -0,0 +1,71 @@
+import numpy as np
+from artemis.general.numpy_helpers import get_rng
+from artemis.general.should_be_builtins import izip_equal
+from artemis.ml.tools.neuralnets import initialize_weight_matrix
+from plato.core import create_shared_variable
+from plato.interfaces.helpers import get_named_activation_function, get_named_activation_function_derivative
+from plato.tools.mlp.manual_backprop_net import IManualBackpropLayer, ManualBackpropNet
+from theano import tensor as tt
+
+
+class FeedbackAlignmentLayer(IManualBackpropLayer):
+    """Dense layer for feedback alignment: backward_pass returns grad_presig.dot(w_back) as the gradient to propagate, where w_back has shape (n_out, n_in) and is not listed in `parameters`."""
+    def __init__(self, w, w_back, nonlinearity, b=None, backwards_nonlinearity = 'deriv'):
+        self.n_in, self.n_out = w.shape
+
+        assert w_back.shape == (self.n_out, self.n_in)
+        self.w = create_shared_variable(w)
+        self.b = create_shared_variable(np.zeros(w.shape[1]) if b is None else b)
+        self.w_back = create_shared_variable(w_back)
+
+        self.nonlinearity = get_named_activation_function(nonlinearity) if isinstance(nonlinearity, str) else nonlinearity
+        self.backwards_nonlinearity = \
+            get_named_activation_function_derivative(nonlinearity) if backwards_nonlinearity=='deriv' else \
+            get_named_activation_function(backwards_nonlinearity) if isinstance(backwards_nonlinearity, basestring) else \
+            backwards_nonlinearity
+
+    @property
+    def parameters(self):
+        return [self.w, self.b]
+
+    def forward_pass_and_state(self, x):
+        pre_sig = x.dot(self.w)  # NOTE(review): self.b is never added here, although it is a parameter and backward_pass returns a gradient for it - confirm intent
+        out = self.nonlinearity(pre_sig)
+        return out, (x, pre_sig, out)
+
+    def backward_pass(self, state, grad, cost):
+        x, pre_sig, out = state
+        if grad is None:
+            grad_presig = tt.grad(cost, wrt = pre_sig)
+        else:
+            # return self.backward_pass(state=state, grad=grad, cost=None)
+            assert cost is None and grad is not None
+            grad_presig = grad * self.backwards_nonlinearity(pre_sig)
+        return grad_presig.dot(self.w_back), [x.T.dot(grad_presig), grad_presig.mean(axis=0)]
+
+    @classmethod
+    def from_init(cls, n_in, n_out, w_init='xavier', rng=None, **kwargs):
+        rng = get_rng(rng)
+        w = initialize_weight_matrix(n_in=n_in, n_out=n_out, mag=w_init, rng=rng)
+        w_back = initialize_weight_matrix(n_in=n_out, n_out=n_in, mag=w_init, rng=rng)
+        return cls(w=w, w_back=w_back, **kwargs)
+
+
+def create_feedback_alignment_net(layer_sizes, nonlinearity, final_nonlinearity, optimizer, loss, backwards_nonlinearity='deriv',
+        w_init = 'xavier', rng = None):
+    """Build a ManualBackpropNet with one FeedbackAlignmentLayer per consecutive pair in layer_sizes; every layer uses `nonlinearity` except the last, which uses `final_nonlinearity`.  All layers draw from the same rng."""
+    rng = get_rng(rng)
+    return ManualBackpropNet(
+        layers = [
+            FeedbackAlignmentLayer.from_init(
+                n_in=n_in,
+                n_out=n_out,
+                nonlinearity=nonlinearity if i < len(layer_sizes)-2 else final_nonlinearity,
+                backwards_nonlinearity = backwards_nonlinearity,
+                w_init = w_init,
+                rng=rng
+                ) for i, (n_in, n_out) in enumerate(izip_equal(layer_sizes[:-1], layer_sizes[1:]))
+            ],
+        optimizer=optimizer,
+        loss=loss
+        )
\ No newline at end of file
diff --git a/plato/tools/misc/tdb_plotting.py b/plato/tools/misc/tdb_plotting.py
index 398a34a..0ab582f 100644
--- a/plato/tools/misc/tdb_plotting.py
+++ b/plato/tools/misc/tdb_plotting.py
@@ -54,8 +54,7 @@ def tdbplot(var, name = None, plot_type = None, draw_every=None, overwright_name
     # tdb_trace(var, name, callback=partial(set_plot_data_and_update, name=name, draw_every=draw_every), overwright_names=overwright_names)
     tdb_trace(var, name, overwrite_names=overwright_names)
 
-    CallbackCatcher.get_current().add_callback(plot_all_trace_variables)
-
+    CallbackCatcher.get_current().add_callback(partial(plot_all_trace_variables, draw_every=draw_every))
 
 
 @contextmanager
diff --git a/plato/tools/mlp/demo_mnist_mlp.py b/plato/tools/mlp/demo_mnist_mlp.py
index 5918a5c..6686771 100644
--- a/plato/tools/mlp/demo_mnist_mlp.py
+++ b/plato/tools/mlp/demo_mnist_mlp.py
@@ -83,15 +83,22 @@ def vis_callback(info, score):
     return info_score_pair_sequence
 
 
-X=demo_mnist_mlp.add_variant('mini-mnist', max_training_samples=1000, max_test_samples=1000, hidden_sizes=[100], n_epochs=100, visualize_params=True)
+demo_mnist_mlp.add_variant('full-batch', minibatch_size = 'full', n_epochs = 1000)
+demo_mnist_mlp.add_variant('deep', hidden_sizes=[500, 500, 500, 500])
 
-X.add_variant('full-batch', minibatch_size = 'full', n_epochs = 1000)
+# demo_mnist_mlp.get_variant('deep').run()
+print demo_mnist_mlp.get_variant('deep').get_latest_record().get_log()
 
-X.add_variant('L2-loss', cost='mse', onehot=True, learning_rate=0.01)
 
-demo_mnist_mlp.add_variant(hidden_sizes=[])
+# X=demo_mnist_mlp.add_variant('mini-mnist', max_training_samples=1000, max_test_samples=1000, hidden_sizes=[100], n_epochs=100, visualize_params=True)
+#
+# X.add_variant('full-batch', minibatch_size = 'full', n_epochs = 1000)
+#
+# X.add_variant('L2-loss', cost='mse', onehot=True, learning_rate=0.01)
+#
+# demo_mnist_mlp.add_variant(hidden_sizes=[])
 
 
-if __name__ == '__main__':
+# if __name__ == '__main__':
 
-    browse_experiments()
+    # browse_experiments()
diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 58aa72c..7bad6ae 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -95,10 +95,6 @@ def forward_pass_and_state(self, x):
         """
 
 
-    # def backward_pass_from_loss(self, state, loss):
-    #     grad = tt.grad(loss, wrt=state[-1])
-    #     return self.backward_pass_from_grad(state, grad)
-
     @abstractmethod
     def backward_pass(self, state, grad, cost):
         """

From 39dd54a12838c28b3aeda7d8d63b34f44b917eaa Mon Sep 17 00:00:00 2001
From: peter <peter.ed.oconnor@gmail.com>
Date: Fri, 28 Jul 2017 19:23:10 +0200
Subject: [PATCH 07/29] oook

---
 plato/tools/common/config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/plato/tools/common/config.py b/plato/tools/common/config.py
index 160ec7d..b071c59 100644
--- a/plato/tools/common/config.py
+++ b/plato/tools/common/config.py
@@ -38,3 +38,4 @@ def hold_theano_optimizer(value):
     theano.config.optimizer = value
     yield
     theano.config.optimizer = old_val
+

From 9137f282019d245b4912d9280b427efb4cfe7d1b Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Wed, 11 Oct 2017 08:05:07 +0200
Subject: [PATCH 08/29] eeehhh

---
 plato/tools/common/basic.py             |  4 ++--
 plato/tools/common/online_predictors.py | 28 ++++++++++++++++++++++++-
 plato/tools/mlp/mlp.py                  |  4 ++--
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/plato/tools/common/basic.py b/plato/tools/common/basic.py
index 2ead9e9..c9288ed 100644
--- a/plato/tools/common/basic.py
+++ b/plato/tools/common/basic.py
@@ -81,7 +81,7 @@ def running_mean_and_variance(data, decay = None, shape = None, elementwise=True
         var_new = s_new
     add_update(mean_last, mean_new)
     add_update(s_last, s_new)
-    return var_new
+    return mean_new, var_new
 
 
 @symbolic
@@ -93,4 +93,4 @@ def running_variance(data, decay=None, shape = None, elementwise=True, initial_v
     :param shape:
     :return:
     """
-    return running_mean_and_variance(data=data, decay=decay, shape=shape, elementwise=elementwise, initial_var = initial_value)
+    return running_mean_and_variance(data=data, decay=decay, shape=shape, elementwise=elementwise, initial_var = initial_value)[1]
diff --git a/plato/tools/common/online_predictors.py b/plato/tools/common/online_predictors.py
index 4bc6281..3302b94 100644
--- a/plato/tools/common/online_predictors.py
+++ b/plato/tools/common/online_predictors.py
@@ -1,4 +1,6 @@
 from abc import ABCMeta, abstractmethod
+from contextlib import contextmanager
+
 from plato.interfaces.decorators import symbolic_simple, symbolic_updater
 from plato.interfaces.interfaces import IParameterized
 from plato.tools.optimization.cost import get_named_cost_function
@@ -87,6 +89,25 @@ def parameters(self):
         return self._function.parameters + opt_params
 
 
+_LOCAL_LOSSES = None
+
+
+def declare_local_loss(loss):  # Append `loss` to the list opened by capture_local_losses(); does nothing when no capture is open.
+    if _LOCAL_LOSSES is not None:
+        _LOCAL_LOSSES.append(loss)
+
+
+@contextmanager
+def capture_local_losses():  # Context manager that yields a fresh list collecting every loss passed to declare_local_loss() inside the with-block.
+    global  _LOCAL_LOSSES
+    assert _LOCAL_LOSSES is None, "Local loss book already open"  # captures cannot be nested
+    _LOCAL_LOSSES = []
+    try:
+        yield _LOCAL_LOSSES
+    finally:
+        _LOCAL_LOSSES = None  # always close the book, even if the with-body raised
+
+
 class CompiledSymbolicPredictor(IPredictor, IParameterized):
     """
     A Predictor containing the compiled methods for a SymbolicPredictor.
@@ -125,7 +146,12 @@ def __call__(self, x):
         raise NotImplementedError()
 
     def train(self, x, y, cost_fcn, optimizer, assert_all_params_optimized=False, regularization_cost = None):
-        cost = cost_fcn(self.train_call(x), y)
+        with capture_local_losses() as local_losses:
+            cost = cost_fcn(self.train_call(x), y)
+
+        if len(local_losses)>0:
+            cost = cost + sum(local_losses)
+
         if regularization_cost is not None:
             cost = cost + regularization_cost(self.parameters)
         if isinstance(optimizer, dict):
diff --git a/plato/tools/mlp/mlp.py b/plato/tools/mlp/mlp.py
index c97a2e6..2b86baa 100644
--- a/plato/tools/mlp/mlp.py
+++ b/plato/tools/mlp/mlp.py
@@ -28,8 +28,8 @@ def __call__(self, x):
         return x
 
     @symbolic
-    def get_layer_activations(self, x):
-        activations = []
+    def get_layer_activations(self, x, include_input = False):
+        activations = [x] if include_input else []
         for lay in self.layers:
             x = lay(x)
             activations.append(x)

From 8c1cebf40087f422d97bfbf6f1393b565d5264f1 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Fri, 13 Oct 2017 16:24:10 +0200
Subject: [PATCH 09/29] BEFORE_CHAGNING_ENCODER

---
 plato/core.py                               | 277 +++++++++++---------
 plato/interfaces/helpers.py                 |  18 +-
 plato/interfaces/test_helpers.py            |  16 ++
 plato/test_core.py                          |  35 ++-
 plato/tools/common/config.py                |   1 -
 plato/tools/convnet/conv_specifiers.py      |  14 +-
 plato/tools/convnet/convnet.py              |   4 +-
 plato/tools/mlp/manual_backprop_net.py      |  34 ++-
 plato/tools/mlp/test_manual_backprop_net.py |  15 ++
 plato/tools/optimization/cost.py            |   6 +-
 10 files changed, 280 insertions(+), 140 deletions(-)

diff --git a/plato/core.py b/plato/core.py
index 42c8895..d25fbdb 100644
--- a/plato/core.py
+++ b/plato/core.py
@@ -63,6 +63,103 @@ def my_symbolic_function(x, y):
 Variable.idtype = property(lambda self: (self.ival.dtype if isinstance(self.ival, np.ndarray) else type(self.ival)))
 
 
+
+class IFormat(object):
+
+    @staticmethod
+    def check(data, f):
+        """
+        Assert that data is in correct format.  Otherwise, throw SymbolicFormatError.  f is the reference to the function
+        whose inputs/outputs/updates are being inspected.  f is passed in so that it can be used in the error message,
+        if any.
+        """
+
+class PassAnythingFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        pass
+
+
+class AnyReturnFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        pass
+
+
+class SingleOutputFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not _is_tensor(data):
+            raise SymbolicFormatError('Function %s was should have returned a tensor output, but instead returned: %s' % (f, data))
+
+
+class MultiOutputFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not _is_tuple_of_tensors(data):
+            raise SymbolicFormatError('Function %s was should have returned a tuple-of-tensors output, but instead returned: %s' % (f, data))
+
+
+class NoOutputFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        assert data is None, "Function %s should have returned no output, but it returned %s.  If your intention was to return updates, use add_update instead." % (f, data)
+
+
+class NoUpdatesFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        assert isinstance(data, list), "Updates should be in the form of a list.  Something is strange if this is not the case"
+        if len(data)!=0:
+            raise SymbolicFormatError("Function %s should have created no state updates, but it created updates: %s" % (f, data))
+
+
+class SomeUpdatesFormat(IFormat):
+    """Update format that requires the function to have produced at least one state update."""
+    @staticmethod
+    def check(data, f):
+        assert isinstance(data, list), "Updates should be in the form of a list.  Something is strange if this is not the case"
+        if len(data) == 0:
+            raise SymbolicFormatError("Function %s should have created state updates, but it failed to update any variables!" % (f, ))
+
+
+class NamedCollectionFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not _is_named_collection(data):
+            raise SymbolicFormatError("Data should be a named collection, in a dict<string:tensor> format.  Right now it looks like this: %s" % (data, ))
+
+
+class CollectionOfCollectionsOfTensorsFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not _is_tuple_of_tuples_of_tensors(data):
+            raise SymbolicFormatError("Data should be a collection of collections of tensors.  Right now it looks like this: %s" % (data, ))
+
+
+class ConstantFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not isinstance(data, (float, int, np.ndarray)):
+            raise SymbolicFormatError("Data should be a constant, numeric data (numpy or python float, etc).  Right now it looks like this: %s" % (data, ))
+
+
+class SymbolicFormatError(Exception):
+    pass
+
+
+
+
+
 def symbolic(fcn):
     """
     Use this to decorate a symbolic function with any return format (it will be detected automatically).
@@ -203,7 +300,7 @@ def __call__(self, *args, **kwargs):
         self.output_format.check(symbolic_return, self.fcn)
         return symbolic_return
 
-    def scan(self, **scan_kwargs):
+    def scan(self, *sequence_args, **scan_kwargs):
         """
         Apply a scan to this function.  For arguments, see thr
         :param scan_kwargs: See theano.scan doc: http://deeplearning.net/software/theano/library/scan.html#theano.scan
@@ -211,26 +308,69 @@ def scan(self, **scan_kwargs):
             [sequences[0], ... sequences[-1], outputs_info[0], ... outputs_info[-1], non_sequences[0], ... non_sequences[-1]]
         :return:
         """
-        outputs, updates = theano.scan(self._call_with_updates_returned, **scan_kwargs)
 
-        if self._had_to_add_dummies:
-            # See why this is necessary: https://groups.google.com/forum/#!topic/theano-users/F0-EeC0Lsl8
-            # Basically, we need to undo some evil that is done in theano's scan function.  See _call_with_updates_returned
-            outputs = outputs[:-2]
+        if len(sequence_args)>0:
+            assert 'sequences' not in scan_kwargs, 'You can either specify sequences as unnamed args or not'
+            scan_kwargs = scan_kwargs.copy()
+            scan_kwargs['sequences'] = sequence_args
+
+        outputs, updates = theano.scan(self._call_with_updates_returned, return_list = True, **scan_kwargs)
 
-        if len(self._trace_info)>0:
+
+        #
+        # if self._had_to_add_dummies:
+        #     # See why this is necessary: https://groups.google.com/forum/#!topic/theano-users/F0-EeC0Lsl8
+        #     # Basically, we need to undo some evil that is done in theano's scan function.  See _call_with_updates_returned
+        #     outputs = outputs[:-2]
+
+        if len(self._trace_info)>0:  # Peel off trace variables if any
             trace_outputs = outputs[-len(self._trace_info):]
             outputs = outputs[:-len(self._trace_info)]
             for (trace_name, (_, batch_in_scan, callback)), trace_output in izip_equal(self._trace_info.iteritems(), trace_outputs):
                 CaptureTraceVariables.CURRENT_CATCHER.add_trace(variable=trace_output if batch_in_scan else trace_output[-1], name=trace_name, batch_in_scan=batch_in_scan, callback=callback)
 
-        if self._single_output and isinstance(outputs, (list, tuple)):
+        if self._output_format is None:
+            outputs = None
+        elif self._output_format == 'single':
             assert len(outputs)==1, 'This should always be true, and you should call Peter if it is not.  +3163004422 seven'
             outputs, = outputs
+        else:
+            assert self._output_format == 'tuple'
+            outputs = outputs
+
+        # outputs = \
+        #     None if self._output_format is None else \
+
+
+        # if self._single_output and isinstance(outputs, (list, tuple)):
+        #     assert len(outputs)==1, 'This should always be true, and you should call Peter if it is not.  +3163004422 seven'
+        #     outputs, = outputs
         for (shared_var, new_val) in updates.items():
             add_update(shared_var, new_val)
         return outputs
 
+    def _call_with_updates_returned(self, *args, **kwargs):
+        with CaptureUpdates(swallow=True) as sc, CaptureTraceVariables(swallow=True) as traces:
+            outputs = self(*args, **kwargs)
+
+        # self._single_output = isinstance(outputs, Variable)
+        self._trace_info = traces.get_trace_variable_info()
+
+        # Due to trace variables, we will convert outputs to tuple.  We preserve original format here.
+        self._output_format = None if outputs is None else \
+            'single' if isinstance(outputs, Variable) else \
+            'tuple'
+
+        outputs = \
+            () if self._output_format is None else \
+            (outputs, ) if self._output_format =='single' else \
+            outputs
+
+        if len(traces)>0:
+            outputs = outputs + tuple(traces.values())
+
+        return outputs, OrderedDict(sc.get_updates())
+
     def eval(self, *args, **kwargs):
         """
         Compile and evaluate the function for the given inputs.
@@ -249,27 +389,6 @@ def __eq__(self, other):
                 return True
         return False
 
-    def _call_with_updates_returned(self, *args, **kwargs):
-        with CaptureUpdates(swallow=True) as sc, CaptureTraceVariables(swallow=True) as traces:
-            outputs = self(*args, **kwargs)
-
-        self._single_output = isinstance(outputs, Variable)
-        self._trace_info = traces.get_trace_variable_info()
-
-        if self._single_output and len(traces)>0:
-            outputs = (outputs, )
-        elif outputs is None:
-            outputs = (tt.zeros(), )
-
-        if len(traces)>0:
-            outputs = outputs + tuple(traces.values())
-
-        self._had_to_add_dummies = isinstance(outputs, (list, tuple)) and len(outputs)==1 # Necessary evil to force theano.scan to return collection even if length is 1.
-        if self._had_to_add_dummies:
-            outputs = outputs + type(outputs)([tt.zeros(()), tt.zeros(())])
-
-        return outputs, OrderedDict(sc.get_updates())
-
     def to_format(self, format_decorator):
 
         @format_decorator
@@ -322,15 +441,8 @@ def locals(self):
         return self._captured_locals
 
 
-class IFormat(object):
-
-    @staticmethod
-    def check(data, f):
-        """
-        Assert that data is in correct format.  Otherwise, throw SymbolicFormatError.  f is the reference to the function
-        whose inputs/outputs/updates are being inspected.  f is passed in so that it can be used in the error message,
-        if any.
-        """
+# Need to do this here instead of decorating because _SymbolicFunctionWrapper is not defined yet at decoration-time.
+_SymbolicFunctionWrapper.scan = symbolic(_SymbolicFunctionWrapper.scan)
 
 
 def _detect_format(data):
@@ -376,89 +488,6 @@ def convert_formats(data, src_format, dest_format):
         raise SymbolicFormatError('No way to convert data from %s to %s' % (src_format, dest_format))
 
 
-class PassAnythingFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        pass
-
-
-class AnyReturnFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        pass
-
-
-class SingleOutputFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not _is_tensor(data):
-            raise SymbolicFormatError('Function %s was should have returned a tensor output, but instead returned: %s' % (f, data))
-
-
-class MultiOutputFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not _is_tuple_of_tensors(data):
-            raise SymbolicFormatError('Function %s was should have returned a tuple-of-tensors output, but instead returned: %s' % (f, data))
-
-
-class NoOutputFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        assert data is None, "Function %s should have returned no output, but it returned %s.  If your intention was to return updates, use add_update instead." % (f, data)
-
-
-class NoUpdatesFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        assert isinstance(data, list), "Updates should be in the form of a list.  Something is strange if this is not the case"
-        if len(data)!=0:
-            raise SymbolicFormatError("Function %s should have created no state updates, but it created updates: %s" % (f, data))
-
-
-class SomeUpdatesFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if isinstance(data, list): "Updates should be in the form of a list.  Something is strange if this is not the case"
-        if len(data) == 0:
-            raise SymbolicFormatError("Function %s should have created state updates, but it failed to update any variables!" % (f, ))
-
-
-class NamedCollectionFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not _is_named_collection(data):
-            raise SymbolicFormatError("Data should be a named collection, in a dict<string:tensor> format.  Right now it looks like this: %s" % (data, ))
-
-
-class CollectionOfCollectionsOfTensorsFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not _is_tuple_of_tuples_of_tensors(data):
-            raise SymbolicFormatError("Data should be a collection of collections of tensors.  Right now it looks like this: %s" % (data, ))
-
-
-class ConstantFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not isinstance(data, (float, int, np.ndarray)):
-            raise SymbolicFormatError("Data should be a constant, numeric data (numpy or python float, etc).  Right now it looks like this: %s" % (data, ))
-
-
-class SymbolicFormatError(Exception):
-    pass
-
-
 
 def _is_tensor(arg):
     return isinstance(arg, (Variable, np.ndarray))
@@ -594,8 +623,10 @@ def __call__(self, *args, **kwargs):
             for cb in cc.get_callbacks():
                 self._callbacks.append(cb)
 
-            if outputs is None:
+            self.outputs_none = outputs is None
+            if self.outputs_none:
                 outputs = ()
+
             PLATO_LOGGER.info('Done.')
             updates = sc.get_updates()
 
@@ -674,7 +705,7 @@ def __call__(self, *args, **kwargs):
         for c in self._callbacks:
             c()
 
-        return true_out
+        return None if self.outputs_none else true_out
 
     def reset(self):
         assert self.resettable, "If you want to reset the state of your compiled function, you must compile with f.compile(resettable=True)"
diff --git a/plato/interfaces/helpers.py b/plato/interfaces/helpers.py
index 47ec69e..bfd7e94 100644
--- a/plato/interfaces/helpers.py
+++ b/plato/interfaces/helpers.py
@@ -1,5 +1,5 @@
 import numpy as np
-from plato.core import symbolic_simple, add_update, create_shared_variable, symbolic
+from plato.core import symbolic_simple, add_update, create_shared_variable, symbolic, CaptureUpdates
 from plato.interfaces.interfaces import IParameterized
 import theano
 from theano.compile.sharedvalue import SharedVariable
@@ -10,6 +10,7 @@
 import theano.tensor as tt
 from theano.tensor.sharedvar import TensorSharedVariable
 from theano.tensor.var import TensorVariable
+from theano.gof.graph import Variable
 
 __author__ = 'peter'
 
@@ -195,7 +196,7 @@ def __call__(self, x):
         return x - running_mean
 
 
-def batchify_function(fcn, batch_size):
+def batchify_function(fcn, batch_size, **scan_kwargs):
     """
     Given a symbolic function, transform it so that computes its input in a sequence of minibatches, instead of in
     one go.  This can be useful when:
@@ -214,10 +215,17 @@ def batchify_function(fcn, batch_size):
     def batch_function(*args):
         start_ixs = tt.arange(0, args[0].shape[0], batch_size)
         @symbolic
-        def process_batch(start_ix, end_ix):
+        def process_batch(start_ix, end_ix, *args):
             return fcn(*[arg[start_ix:end_ix] for arg in args])
-        out = process_batch.scan(sequences = [start_ixs, start_ixs+batch_size])
-        return out.reshape((-1, )+tuple(out.shape[i] for i in xrange(2, out.ndim)), ndim=out.ndim-1)
+
+        out = process_batch.scan(sequences = [start_ixs, start_ixs+batch_size], non_sequences = args, **scan_kwargs)
+        # The scan result is None, a single tensor, or a collection of tensors; each tensor gets its first two axes merged into one.
+        if out is None:
+            return None
+        elif isinstance(out, Variable):
+            return out.reshape((-1, )+tuple(out.shape[i] for i in xrange(2, out.ndim)), ndim=out.ndim-1)
+        else:
+            return out.__class__(o.reshape((-1, )+tuple(o.shape[i] for i in xrange(2, o.ndim)), ndim=o.ndim-1) for o in out)
     return batch_function
 
 
diff --git a/plato/interfaces/test_helpers.py b/plato/interfaces/test_helpers.py
index 95f2a05..f9661ac 100644
--- a/plato/interfaces/test_helpers.py
+++ b/plato/interfaces/test_helpers.py
@@ -43,6 +43,21 @@ def add_them(a, b):
     assert np.allclose(out, arr_a+arr_b)
 
 
+def test_batch_without_return():
+
+    state = create_shared_variable(np.zeros(2))
+
+    @symbolic
+    def do_something_internal(a, b):
+        new_state = state+ a*b
+        add_update(state, new_state)
+        # return new_state
+
+    out = batchify_function(do_something_internal, batch_size=2).compile()(np.arange(6).astype(float), np.arange(1,7).astype(float))
+    assert out is None
+    assert np.array_equal(state.get_value(), [0*1+2*3+4*5, 1*2+3*4+5*6])
+
+
 def test_compute_in_with_state():
 
     @symbolic
@@ -105,3 +120,4 @@ def accumulate(x):
     test_compute_in_with_state()
     test_on_first_pass()
     test_reshaping_shared_variable()
+    test_batch_without_return()
diff --git a/plato/test_core.py b/plato/test_core.py
index f140be9..098f502 100644
--- a/plato/test_core.py
+++ b/plato/test_core.py
@@ -1,7 +1,7 @@
 from abc import abstractmethod
 
 from artemis.general.hashing import compute_fixed_hash, fixed_hash_eq
-from plato.interfaces.helpers import create_shared_variable
+from plato.interfaces.helpers import create_shared_variable, shared_like
 from plato.tools.common.config import hold_float_precision
 from pytest import raises
 from plato.core import symbolic_simple, symbolic_updater, SymbolicFormatError, \
@@ -607,6 +607,37 @@ def my_cumsum(x):
     assert np.array_equal(get_tdb_traces()['x_in_loop_catch_all'], np.arange(4)**3)
 
 
+def test_easy_scan_syntax():
+
+    @symbolic
+    def accumulator(v, shape):
+        accum = create_shared_variable(np.zeros(shape))
+        new_accum = accum + v
+        add_update(accum, new_accum)
+        return new_accum
+
+    x = np.random.randn(5, 3)
+    f = accumulator.partial(shape=x.shape[1:]).scan.compile()
+
+    assert np.allclose(f(x), np.cumsum(x, axis=0))
+
+
+def test_scan_no_return():
+
+    state = create_shared_variable(np.zeros(()))
+
+    @symbolic
+    def do_something_internal(a, b):
+        new_state = state+ a*b
+        add_update(state, new_state)
+
+    out = do_something_internal.scan.compile()(np.arange(6).astype(float), np.arange(1,7).astype(float))
+
+    assert out is None
+    assert np.array_equal(state.get_value(), np.arange(6).dot(np.arange(1, 7)))
+
+
+
 if __name__ == '__main__':
     test_ival_ishape()
     test_catch_sneaky_updates()
@@ -630,3 +661,5 @@ def my_cumsum(x):
     test_shared_input()
     test_function_reset()
     test_trace_var_in_scan()
+    test_easy_scan_syntax()
+    test_scan_no_return()
\ No newline at end of file
diff --git a/plato/tools/common/config.py b/plato/tools/common/config.py
index b071c59..160ec7d 100644
--- a/plato/tools/common/config.py
+++ b/plato/tools/common/config.py
@@ -38,4 +38,3 @@ def hold_theano_optimizer(value):
     theano.config.optimizer = value
     yield
     theano.config.optimizer = old_val
-
diff --git a/plato/tools/convnet/conv_specifiers.py b/plato/tools/convnet/conv_specifiers.py
index ecba4b3..158beee 100644
--- a/plato/tools/convnet/conv_specifiers.py
+++ b/plato/tools/convnet/conv_specifiers.py
@@ -1,6 +1,8 @@
 from artemis.fileman.primitive_specifiers import PrimativeSpecifier
+from artemis.general.numpy_helpers import get_rng
 from artemis.general.should_be_builtins import bad_value
-
+from artemis.ml.tools.neuralnets import initialize_weight_matrix
+import numpy as np
 __author__ = 'peter'
 
 
@@ -51,6 +53,16 @@ def __init__(self, w, b, mode):
         self.b=b
         self.mode = mode
 
+    @staticmethod
+    def from_init(k_shape, mode, mag='xavier', use_biases=True, rng=None):
+        """Build a ConvolverSpec with random weights of shape k_shape = (n_out_maps, n_in_maps, k_size_y, k_size_x)."""
+        n_out_maps, n_in_maps, k_size_y, k_size_x = k_shape
+        if mag != 'xavier':  # Only Xavier scaling is implemented; any other value used to leave `w` unbound.
+            raise ValueError("Unsupported weight magnitude %r: only 'xavier' is implemented." % (mag, ))
+        w = 1./np.sqrt((n_in_maps+n_out_maps)*k_size_x*k_size_y) * get_rng(rng).randn(*k_shape)  # 1/sqrt(fanin+fanout)
+        b = np.zeros(n_out_maps) if use_biases else False
+        return ConvolverSpec(w, b, mode)
+
     def shape_transfer(self, (n_samples, n_maps, size_y, size_x)):
         return (n_samples, self.w.shape[0])+{
             'same': (size_y, size_x),
diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index 7537aa2..94b7304 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -53,7 +53,7 @@ def from_spec(cls, spec):
         return ConvLayer(
             w=spec.w,
             b=spec.b,
-            border_mode= {'full': 0, 'same': 1, 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode,
+            border_mode= {'full': 0, 'same': 'half', 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode,
             filter_flip=False
             )
 
@@ -267,7 +267,7 @@ def specifier_to_layer(spec, force_shared_parameters=True, rng = None):
             w=spec.w,
             b=spec.b,
             force_shared_parameters=force_shared_parameters,
-            border_mode= {'full': 0, 'same': 1, 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode,
+            border_mode= {'full': 0, 'same': 'half', 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode,
             filter_flip=False
             ),
         NonlinearitySpec: lambda: Nonlinearity(spec.func),
diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 7bad6ae..145e2b8 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -1,4 +1,5 @@
 from abc import abstractmethod
+from collections import OrderedDict
 
 import numpy as np
 from artemis.general.should_be_builtins import izip_equal
@@ -13,14 +14,20 @@
 
 
 class ManualBackpropNet(ISymbolicPredictor):
-
+    """
+    A sequential (chain) network where you can insert layers that do backprop manually.
+    """
     def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass_loss = True):
         """
         :param layrs:
         :param optimizer:
         :param loss:
         """
-        self.layers = layers
+        if isinstance(layers, OrderedDict):
+            self.layer_names, self.layers = zip(*layers.items())
+        else:
+            self.layer_names = range(len(layers))
+            self.layers = layers
         self.optimizer = optimizer
         self.pass_loss = pass_loss
         self.loss = get_named_cost_function(loss) if isinstance(loss, basestring) else loss
@@ -35,7 +42,7 @@ def predict(self, x):
 
     def _predict_in_single_pass(self, x):
         for i, layer in enumerate(self.layers):
-            x = layer.forward_pass(x)
+            x = layer(x)
         return x
 
     @symbolic
@@ -46,8 +53,14 @@ def _predict_minibatch(self, start, end, x):
     def train(self, x, y):
         states = {}
         for layer in self.layers:
-            x, layer_state = layer.forward_pass_and_state(x)
+            if isinstance(layer, IManualBackpropLayer):
+                x, layer_state = layer.forward_pass_and_state(x)
+            else:
+                out = layer(x)
+                layer_state = (x, out)
+                x = out
             states[layer]=layer_state
+
         layerwise_param_grad_pairs = []
         loss = self.loss(x, y)
         if self.pass_loss:
@@ -56,7 +69,12 @@ def train(self, x, y):
             grad = tt.grad(loss, wrt=x)
             loss = None
         for layer in self.layers[::-1]:
-            grad, param_grads = layer.backward_pass(state=states[layer], grad=grad, cost = loss)
+            if isinstance(layer, IManualBackpropLayer):
+                grad, param_grads = layer.backward_pass(state=states[layer], grad=grad, cost = loss)
+            else:
+                x, y = states[layer]
+                grads = tt.grad(cost=loss, wrt=[x]+list(layer.parameters), known_grads={y: grad} if grad is not None else None)
+                grad, param_grads = grads[0], grads[1:]
             loss = None
             layerwise_param_grad_pairs.append(list(izip_equal(layer.parameters, param_grads)))
         if isinstance(self.optimizer, IGradientOptimizer):
@@ -83,6 +101,9 @@ def forward_pass(self, x):
         out, _ = self.forward_pass_and_state(x)
         return out
 
+    def __call__(self, *args):
+        return self.forward_pass(*args)
+
     @abstractmethod
     def forward_pass_and_state(self, x):
         """
@@ -93,7 +114,7 @@ def forward_pass_and_state(self, x):
                 state is a list of state-variables to be passed into the backward pass.
                 Importantly, they must be in order (so that the last element of state is the one used to compute the gradient)
         """
-
+        raise NotImplementedError()
 
     @abstractmethod
     def backward_pass(self, state, grad, cost):
@@ -102,6 +123,7 @@ def backward_pass(self, state, grad, cost):
         :param grad: The incoming gradient
         :return: The outgoing gradient
         """
+        raise NotImplementedError()
 
 
 class ExactBackpropLayer(IManualBackpropLayer):
diff --git a/plato/tools/mlp/test_manual_backprop_net.py b/plato/tools/mlp/test_manual_backprop_net.py
index d606a3d..6cc5a19 100644
--- a/plato/tools/mlp/test_manual_backprop_net.py
+++ b/plato/tools/mlp/test_manual_backprop_net.py
@@ -25,30 +25,45 @@ def test_exact_manual_backprop_net():
         optimizer = GradientDescent(0.1),
         loss = 'softmax-xe'
         )
+    stick_shifted_by_robot = ManualBackpropNet(
+        layers = MultiLayerPerceptron.from_weights(weights=ws, hidden_activations='relu', output_activation='linear').layers,
+        optimizer = GradientDescent(0.1),
+        loss = 'softmax-xe'
+        )
 
     # Check forward passes match
     fp_auto = auto_mlp.predict.compile()
     fp_stick = stick_mlp.predict.compile()
+    fp_robot = stick_shifted_by_robot.predict.compile()
+
     out_auto = fp_auto(x)
     out_stick = fp_stick(x)
+    out_robot = fp_robot(x)
     assert np.allclose(out_auto, out_stick)
+    assert np.allclose(out_auto, out_robot)
 
     # 1 Iteration of training
     ft_auto = auto_mlp.train.compile()
     ft_stick = stick_mlp.train.compile()
+    ft_robot = stick_shifted_by_robot.train.compile()
     ft_auto(x, y)
     ft_stick(x, y)
+    ft_robot(x, y)
 
     # Check parameter changes match
     dw0_auto = auto_mlp._function.layers[0].linear_transform.w.get_value() - ws[0]
     dw0_stick = stick_mlp.layers[0].linear_transform.w.get_value() - ws[0]
+    dw0_robot = stick_shifted_by_robot.layers[0].linear_transform.w.get_value() - ws[0]
     assert np.allclose(dw0_auto, dw0_stick)
+    assert np.allclose(dw0_auto, dw0_robot)
 
     # Check outputs match
     new_out_auto = fp_auto(x)
     new_out_stick = fp_stick(x)
+    new_out_robot = fp_robot(x)
     assert np.allclose(new_out_auto, new_out_stick)
     assert not np.allclose(new_out_stick, out_auto)
+    assert np.allclose(new_out_auto, new_out_robot)
 
 
 if __name__ == '__main__':
diff --git a/plato/tools/optimization/cost.py b/plato/tools/optimization/cost.py
index ef23e2b..32a5eac 100644
--- a/plato/tools/optimization/cost.py
+++ b/plato/tools/optimization/cost.py
@@ -61,7 +61,11 @@ def negative_log_likelihood_dangerous(actual, target):
 
 @symbolic_simple
 def mean_squared_error(actual, target):
-    return tt.mean(tt.sum((actual-target)**2, axis = 1), axis = 0)
+    if actual.ndim==2:
+        return tt.mean(tt.sum((actual-target)**2, axis = 1), axis = 0)
+    else:
+        return tt.mean(tt.sum((actual.flatten(2)-target.flatten(2))**2, axis = 1), axis = 0)
+
 
 
 @symbolic_simple

From d49771427e7aa9c28e409628f3df7d0843f64a3c Mon Sep 17 00:00:00 2001
From: peter <peter.ed.oconnor@gmail.com>
Date: Tue, 17 Oct 2017 16:12:30 +0200
Subject: [PATCH 10/29] messin with ManualBackpropNet

---
 plato/core.py                               | 277 +++++++++++---------
 plato/interfaces/helpers.py                 |  18 +-
 plato/interfaces/test_helpers.py            |  16 ++
 plato/test_core.py                          |  35 ++-
 plato/tools/common/basic.py                 |   4 +-
 plato/tools/common/config.py                |   1 -
 plato/tools/common/online_predictors.py     |  28 +-
 plato/tools/convnet/conv_specifiers.py      |  14 +-
 plato/tools/convnet/convnet.py              |   4 +-
 plato/tools/mlp/manual_backprop_net.py      |  74 ++++--
 plato/tools/mlp/mlp.py                      |   4 +-
 plato/tools/mlp/test_manual_backprop_net.py |  15 ++
 plato/tools/optimization/cost.py            |   6 +-
 13 files changed, 341 insertions(+), 155 deletions(-)

diff --git a/plato/core.py b/plato/core.py
index 42c8895..d25fbdb 100644
--- a/plato/core.py
+++ b/plato/core.py
@@ -63,6 +63,103 @@ def my_symbolic_function(x, y):
 Variable.idtype = property(lambda self: (self.ival.dtype if isinstance(self.ival, np.ndarray) else type(self.ival)))
 
 
+
+class IFormat(object):
+
+    @staticmethod
+    def check(data, f):
+        """
+        Assert that data is in correct format.  Otherwise, throw SymbolicFormatError.  f is the reference to the function
+        whose inputs/outputs/updates are being inspected.  f is passed in so that it can be used in the error message,
+        if any.
+        """
+
+class PassAnythingFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        pass
+
+
+class AnyReturnFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        pass
+
+
+class SingleOutputFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not _is_tensor(data):
+            raise SymbolicFormatError('Function %s was should have returned a tensor output, but instead returned: %s' % (f, data))
+
+
+class MultiOutputFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not _is_tuple_of_tensors(data):
+            raise SymbolicFormatError('Function %s was should have returned a tuple-of-tensors output, but instead returned: %s' % (f, data))
+
+
+class NoOutputFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        assert data is None, "Function %s should have returned no output, but it returned %s.  If your intention was to return updates, use add_update instead." % (f, data)
+
+
+class NoUpdatesFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        assert isinstance(data, list), "Updates should be in the form of a list.  Something is strange if this is not the case"
+        if len(data)!=0:
+            raise SymbolicFormatError("Function %s should have created no state updates, but it created updates: %s" % (f, data))
+
+
+class SomeUpdatesFormat(IFormat):
+    """Format check requiring that the inspected function created at least one state update."""
+    @staticmethod
+    def check(data, f):
+        assert isinstance(data, list), "Updates should be in the form of a list.  Something is strange if this is not the case"
+        if len(data) == 0:
+            raise SymbolicFormatError("Function %s should have created state updates, but it failed to update any variables!" % (f, ))
+
+
+class NamedCollectionFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not _is_named_collection(data):
+            raise SymbolicFormatError("Data should be a named collection, in a dict<string:tensor> format.  Right now it looks like this: %s" % (data, ))
+
+
+class CollectionOfCollectionsOfTensorsFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not _is_tuple_of_tuples_of_tensors(data):
+            raise SymbolicFormatError("Data should be a collection of collections of tensors.  Right now it looks like this: %s" % (data, ))
+
+
+class ConstantFormat(IFormat):
+
+    @staticmethod
+    def check(data, f):
+        if not isinstance(data, (float, int, np.ndarray)):
+            raise SymbolicFormatError("Data should be a constant, numeric data (numpy or python float, etc).  Right now it looks like this: %s" % (data, ))
+
+
+class SymbolicFormatError(Exception):
+    pass
+
+
+
+
+
 def symbolic(fcn):
     """
     Use this to decorate a symbolic function with any return format (it will be detected automatically).
@@ -203,7 +300,7 @@ def __call__(self, *args, **kwargs):
         self.output_format.check(symbolic_return, self.fcn)
         return symbolic_return
 
-    def scan(self, **scan_kwargs):
+    def scan(self, *sequence_args, **scan_kwargs):
         """
         Apply a scan to this function.  For arguments, see thr
         :param scan_kwargs: See theano.scan doc: http://deeplearning.net/software/theano/library/scan.html#theano.scan
@@ -211,26 +308,69 @@ def scan(self, **scan_kwargs):
             [sequences[0], ... sequences[-1], outputs_info[0], ... outputs_info[-1], non_sequences[0], ... non_sequences[-1]]
         :return:
         """
-        outputs, updates = theano.scan(self._call_with_updates_returned, **scan_kwargs)
 
-        if self._had_to_add_dummies:
-            # See why this is necessary: https://groups.google.com/forum/#!topic/theano-users/F0-EeC0Lsl8
-            # Basically, we need to undo some evil that is done in theano's scan function.  See _call_with_updates_returned
-            outputs = outputs[:-2]
+        if len(sequence_args)>0:
+            assert 'sequences' not in scan_kwargs, 'You can either specify sequences as unnamed args or not'
+            scan_kwargs = scan_kwargs.copy()
+            scan_kwargs['sequences'] = sequence_args
+
+        outputs, updates = theano.scan(self._call_with_updates_returned, return_list = True, **scan_kwargs)
 
-        if len(self._trace_info)>0:
+
+        #
+        # if self._had_to_add_dummies:
+        #     # See why this is necessary: https://groups.google.com/forum/#!topic/theano-users/F0-EeC0Lsl8
+        #     # Basically, we need to undo some evil that is done in theano's scan function.  See _call_with_updates_returned
+        #     outputs = outputs[:-2]
+
+        if len(self._trace_info)>0:  # Peel off trace variables if any
             trace_outputs = outputs[-len(self._trace_info):]
             outputs = outputs[:-len(self._trace_info)]
             for (trace_name, (_, batch_in_scan, callback)), trace_output in izip_equal(self._trace_info.iteritems(), trace_outputs):
                 CaptureTraceVariables.CURRENT_CATCHER.add_trace(variable=trace_output if batch_in_scan else trace_output[-1], name=trace_name, batch_in_scan=batch_in_scan, callback=callback)
 
-        if self._single_output and isinstance(outputs, (list, tuple)):
+        if self._output_format is None:
+            outputs = None
+        elif self._output_format == 'single':
             assert len(outputs)==1, 'This should always be true, and you should call Peter if it is not.  +3163004422 seven'
             outputs, = outputs
+        else:
+            assert self._output_format == 'tuple'
+            outputs = outputs
+
+        # outputs = \
+        #     None if self._output_format is None else \
+
+
+        # if self._single_output and isinstance(outputs, (list, tuple)):
+        #     assert len(outputs)==1, 'This should always be true, and you should call Peter if it is not.  +3163004422 seven'
+        #     outputs, = outputs
         for (shared_var, new_val) in updates.items():
             add_update(shared_var, new_val)
         return outputs
 
+    def _call_with_updates_returned(self, *args, **kwargs):
+        with CaptureUpdates(swallow=True) as sc, CaptureTraceVariables(swallow=True) as traces:
+            outputs = self(*args, **kwargs)
+
+        # self._single_output = isinstance(outputs, Variable)
+        self._trace_info = traces.get_trace_variable_info()
+
+        # Due to trace variables, we will convert outputs to tuple.  We preserve original format here.
+        self._output_format = None if outputs is None else \
+            'single' if isinstance(outputs, Variable) else \
+            'tuple'
+
+        outputs = \
+            () if self._output_format is None else \
+            (outputs, ) if self._output_format =='single' else \
+            outputs
+
+        if len(traces)>0:
+            outputs = outputs + tuple(traces.values())
+
+        return outputs, OrderedDict(sc.get_updates())
+
     def eval(self, *args, **kwargs):
         """
         Compile and evaluate the function for the given inputs.
@@ -249,27 +389,6 @@ def __eq__(self, other):
                 return True
         return False
 
-    def _call_with_updates_returned(self, *args, **kwargs):
-        with CaptureUpdates(swallow=True) as sc, CaptureTraceVariables(swallow=True) as traces:
-            outputs = self(*args, **kwargs)
-
-        self._single_output = isinstance(outputs, Variable)
-        self._trace_info = traces.get_trace_variable_info()
-
-        if self._single_output and len(traces)>0:
-            outputs = (outputs, )
-        elif outputs is None:
-            outputs = (tt.zeros(), )
-
-        if len(traces)>0:
-            outputs = outputs + tuple(traces.values())
-
-        self._had_to_add_dummies = isinstance(outputs, (list, tuple)) and len(outputs)==1 # Necessary evil to force theano.scan to return collection even if length is 1.
-        if self._had_to_add_dummies:
-            outputs = outputs + type(outputs)([tt.zeros(()), tt.zeros(())])
-
-        return outputs, OrderedDict(sc.get_updates())
-
     def to_format(self, format_decorator):
 
         @format_decorator
@@ -322,15 +441,8 @@ def locals(self):
         return self._captured_locals
 
 
-class IFormat(object):
-
-    @staticmethod
-    def check(data, f):
-        """
-        Assert that data is in correct format.  Otherwise, throw SymbolicFormatError.  f is the reference to the function
-        whose inputs/outputs/updates are being inspected.  f is passed in so that it can be used in the error message,
-        if any.
-        """
+# Need to do this here instead of decorating because _SymbolicFunctionWrapper is not defined yet at decoration-time.
+_SymbolicFunctionWrapper.scan = symbolic(_SymbolicFunctionWrapper.scan)
 
 
 def _detect_format(data):
@@ -376,89 +488,6 @@ def convert_formats(data, src_format, dest_format):
         raise SymbolicFormatError('No way to convert data from %s to %s' % (src_format, dest_format))
 
 
-class PassAnythingFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        pass
-
-
-class AnyReturnFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        pass
-
-
-class SingleOutputFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not _is_tensor(data):
-            raise SymbolicFormatError('Function %s was should have returned a tensor output, but instead returned: %s' % (f, data))
-
-
-class MultiOutputFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not _is_tuple_of_tensors(data):
-            raise SymbolicFormatError('Function %s was should have returned a tuple-of-tensors output, but instead returned: %s' % (f, data))
-
-
-class NoOutputFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        assert data is None, "Function %s should have returned no output, but it returned %s.  If your intention was to return updates, use add_update instead." % (f, data)
-
-
-class NoUpdatesFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        assert isinstance(data, list), "Updates should be in the form of a list.  Something is strange if this is not the case"
-        if len(data)!=0:
-            raise SymbolicFormatError("Function %s should have created no state updates, but it created updates: %s" % (f, data))
-
-
-class SomeUpdatesFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if isinstance(data, list): "Updates should be in the form of a list.  Something is strange if this is not the case"
-        if len(data) == 0:
-            raise SymbolicFormatError("Function %s should have created state updates, but it failed to update any variables!" % (f, ))
-
-
-class NamedCollectionFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not _is_named_collection(data):
-            raise SymbolicFormatError("Data should be a named collection, in a dict<string:tensor> format.  Right now it looks like this: %s" % (data, ))
-
-
-class CollectionOfCollectionsOfTensorsFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not _is_tuple_of_tuples_of_tensors(data):
-            raise SymbolicFormatError("Data should be a collection of collections of tensors.  Right now it looks like this: %s" % (data, ))
-
-
-class ConstantFormat(IFormat):
-
-    @staticmethod
-    def check(data, f):
-        if not isinstance(data, (float, int, np.ndarray)):
-            raise SymbolicFormatError("Data should be a constant, numeric data (numpy or python float, etc).  Right now it looks like this: %s" % (data, ))
-
-
-class SymbolicFormatError(Exception):
-    pass
-
-
 
 def _is_tensor(arg):
     return isinstance(arg, (Variable, np.ndarray))
@@ -594,8 +623,10 @@ def __call__(self, *args, **kwargs):
             for cb in cc.get_callbacks():
                 self._callbacks.append(cb)
 
-            if outputs is None:
+            self.outputs_none = outputs is None
+            if self.outputs_none:
                 outputs = ()
+
             PLATO_LOGGER.info('Done.')
             updates = sc.get_updates()
 
@@ -674,7 +705,7 @@ def __call__(self, *args, **kwargs):
         for c in self._callbacks:
             c()
 
-        return true_out
+        return None if self.outputs_none else true_out
 
     def reset(self):
         assert self.resettable, "If you want to reset the state of your compiled function, you must compile with f.compile(resettable=True)"
diff --git a/plato/interfaces/helpers.py b/plato/interfaces/helpers.py
index 47ec69e..bfd7e94 100644
--- a/plato/interfaces/helpers.py
+++ b/plato/interfaces/helpers.py
@@ -1,5 +1,5 @@
 import numpy as np
-from plato.core import symbolic_simple, add_update, create_shared_variable, symbolic
+from plato.core import symbolic_simple, add_update, create_shared_variable, symbolic, CaptureUpdates
 from plato.interfaces.interfaces import IParameterized
 import theano
 from theano.compile.sharedvalue import SharedVariable
@@ -10,6 +10,7 @@
 import theano.tensor as tt
 from theano.tensor.sharedvar import TensorSharedVariable
 from theano.tensor.var import TensorVariable
+from theano.gof.graph import Variable
 
 __author__ = 'peter'
 
@@ -195,7 +196,7 @@ def __call__(self, x):
         return x - running_mean
 
 
-def batchify_function(fcn, batch_size):
+def batchify_function(fcn, batch_size, **scan_kwargs):
     """
     Given a symbolic function, transform it so that computes its input in a sequence of minibatches, instead of in
     one go.  This can be useful when:
@@ -214,10 +215,17 @@ def batchify_function(fcn, batch_size):
     def batch_function(*args):
         start_ixs = tt.arange(0, args[0].shape[0], batch_size)
         @symbolic
-        def process_batch(start_ix, end_ix):
+        def process_batch(start_ix, end_ix, *args):
             return fcn(*[arg[start_ix:end_ix] for arg in args])
-        out = process_batch.scan(sequences = [start_ixs, start_ixs+batch_size])
-        return out.reshape((-1, )+tuple(out.shape[i] for i in xrange(2, out.ndim)), ndim=out.ndim-1)
+
+        out = process_batch.scan(sequences = [start_ixs, start_ixs+batch_size], non_sequences = args, **scan_kwargs)
+        # out = theano.scan(process_batch, sequences = [start_ixs, start_ixs+batch_size])
+        if out is None:
+            return None
+        elif isinstance(out, Variable):
+            return out.reshape((-1, )+tuple(out.shape[i] for i in xrange(2, out.ndim)), ndim=out.ndim-1)
+        else:
+            return out.__class__(o.reshape((-1, )+tuple(o.shape[i] for i in xrange(2, o.ndim)), ndim=o.ndim-1) for o in out)
     return batch_function
 
 
diff --git a/plato/interfaces/test_helpers.py b/plato/interfaces/test_helpers.py
index 95f2a05..f9661ac 100644
--- a/plato/interfaces/test_helpers.py
+++ b/plato/interfaces/test_helpers.py
@@ -43,6 +43,21 @@ def add_them(a, b):
     assert np.allclose(out, arr_a+arr_b)
 
 
+def test_batch_without_return():
+
+    state = create_shared_variable(np.zeros(2))
+
+    @symbolic
+    def do_something_internal(a, b):
+        new_state = state+ a*b
+        add_update(state, new_state)
+        # return new_state
+
+    out = batchify_function(do_something_internal, batch_size=2).compile()(np.arange(6).astype(float), np.arange(1,7).astype(float))
+    assert out is None
+    assert np.array_equal(state.get_value(), [0*1+2*3+4*5, 1*2+3*4+5*6])
+
+
 def test_compute_in_with_state():
 
     @symbolic
@@ -105,3 +120,4 @@ def accumulate(x):
     test_compute_in_with_state()
     test_on_first_pass()
     test_reshaping_shared_variable()
+    test_batch_without_return()
diff --git a/plato/test_core.py b/plato/test_core.py
index f140be9..098f502 100644
--- a/plato/test_core.py
+++ b/plato/test_core.py
@@ -1,7 +1,7 @@
 from abc import abstractmethod
 
 from artemis.general.hashing import compute_fixed_hash, fixed_hash_eq
-from plato.interfaces.helpers import create_shared_variable
+from plato.interfaces.helpers import create_shared_variable, shared_like
 from plato.tools.common.config import hold_float_precision
 from pytest import raises
 from plato.core import symbolic_simple, symbolic_updater, SymbolicFormatError, \
@@ -607,6 +607,37 @@ def my_cumsum(x):
     assert np.array_equal(get_tdb_traces()['x_in_loop_catch_all'], np.arange(4)**3)
 
 
+def test_easy_scan_syntax():
+
+    @symbolic
+    def accumulator(v, shape):
+        accum = create_shared_variable(np.zeros(shape))
+        new_accum = accum + v
+        add_update(accum, new_accum)
+        return new_accum
+
+    x = np.random.randn(5, 3)
+    f = accumulator.partial(shape=x.shape[1:]).scan.compile()
+
+    assert np.allclose(f(x), np.cumsum(x, axis=0))
+
+
+def test_scan_no_return():
+
+    state = create_shared_variable(np.zeros(()))
+
+    @symbolic
+    def do_something_internal(a, b):
+        new_state = state+ a*b
+        add_update(state, new_state)
+
+    out = do_something_internal.scan.compile()(np.arange(6).astype(float), np.arange(1,7).astype(float))
+
+    assert out is None
+    assert np.array_equal(state.get_value(), np.arange(6).dot(np.arange(1, 7)))
+
+
+
 if __name__ == '__main__':
     test_ival_ishape()
     test_catch_sneaky_updates()
@@ -630,3 +661,5 @@ def my_cumsum(x):
     test_shared_input()
     test_function_reset()
     test_trace_var_in_scan()
+    test_easy_scan_syntax()
+    test_scan_no_return()
\ No newline at end of file
diff --git a/plato/tools/common/basic.py b/plato/tools/common/basic.py
index 2ead9e9..c9288ed 100644
--- a/plato/tools/common/basic.py
+++ b/plato/tools/common/basic.py
@@ -81,7 +81,7 @@ def running_mean_and_variance(data, decay = None, shape = None, elementwise=True
         var_new = s_new
     add_update(mean_last, mean_new)
     add_update(s_last, s_new)
-    return var_new
+    return mean_new, var_new
 
 
 @symbolic
@@ -93,4 +93,4 @@ def running_variance(data, decay=None, shape = None, elementwise=True, initial_v
     :param shape:
     :return:
     """
-    return running_mean_and_variance(data=data, decay=decay, shape=shape, elementwise=elementwise, initial_var = initial_value)
+    return running_mean_and_variance(data=data, decay=decay, shape=shape, elementwise=elementwise, initial_var = initial_value)[1]
diff --git a/plato/tools/common/config.py b/plato/tools/common/config.py
index b071c59..160ec7d 100644
--- a/plato/tools/common/config.py
+++ b/plato/tools/common/config.py
@@ -38,4 +38,3 @@ def hold_theano_optimizer(value):
     theano.config.optimizer = value
     yield
     theano.config.optimizer = old_val
-
diff --git a/plato/tools/common/online_predictors.py b/plato/tools/common/online_predictors.py
index 389b6d8..56dc8ff 100644
--- a/plato/tools/common/online_predictors.py
+++ b/plato/tools/common/online_predictors.py
@@ -1,4 +1,6 @@
 from abc import ABCMeta, abstractmethod
+from contextlib import contextmanager
+
 from plato.interfaces.decorators import symbolic_simple, symbolic_updater
 from plato.interfaces.interfaces import IParameterized
 from plato.tools.optimization.cost import get_named_cost_function
@@ -94,6 +96,25 @@ def parameters(self):
         return self._function.parameters + opt_params
 
 
+_LOCAL_LOSSES = None
+
+
+def declare_local_loss(loss):
+    if _LOCAL_LOSSES is not None:
+        _LOCAL_LOSSES.append(loss)
+
+
+@contextmanager
+def capture_local_losses():
+    global  _LOCAL_LOSSES
+    assert _LOCAL_LOSSES is None, "Local loss book already open"
+    _LOCAL_LOSSES = []
+    try:
+        yield _LOCAL_LOSSES
+    finally:
+        _LOCAL_LOSSES = None
+
+
 class CompiledSymbolicPredictor(IPredictor, IParameterized):
     """
     A Predictor containing the compiled methods for a SymbolicPredictor.
@@ -132,7 +153,12 @@ def __call__(self, x):
         raise NotImplementedError()
 
     def train(self, x, y, cost_fcn, optimizer, assert_all_params_optimized=False, regularization_cost = None):
-        cost = cost_fcn(self.train_call(x), y)
+        with capture_local_losses() as local_losses:
+            cost = cost_fcn(self.train_call(x), y)
+
+        if len(local_losses)>0:
+            cost = cost + sum(local_losses)
+
         if regularization_cost is not None:
             cost = cost + regularization_cost(self.parameters)
         if isinstance(optimizer, dict):
diff --git a/plato/tools/convnet/conv_specifiers.py b/plato/tools/convnet/conv_specifiers.py
index ecba4b3..158beee 100644
--- a/plato/tools/convnet/conv_specifiers.py
+++ b/plato/tools/convnet/conv_specifiers.py
@@ -1,6 +1,8 @@
 from artemis.fileman.primitive_specifiers import PrimativeSpecifier
+from artemis.general.numpy_helpers import get_rng
 from artemis.general.should_be_builtins import bad_value
-
+from artemis.ml.tools.neuralnets import initialize_weight_matrix
+import numpy as np
 __author__ = 'peter'
 
 
@@ -51,6 +53,16 @@ def __init__(self, w, b, mode):
         self.b=b
         self.mode = mode
 
+    @staticmethod
+    def from_init(k_shape, mode, mag='xavier', use_biases=True, rng=None):
+        """Build a ConvolverSpec with random weights of shape k_shape = (n_out_maps, n_in_maps, k_size_y, k_size_x)."""
+        n_out_maps, n_in_maps, k_size_y, k_size_x = k_shape
+        if mag != 'xavier':  # Only Xavier scaling is implemented; any other value used to leave `w` unbound.
+            raise ValueError("Unsupported weight magnitude %r: only 'xavier' is implemented." % (mag, ))
+        w = 1./np.sqrt((n_in_maps+n_out_maps)*k_size_x*k_size_y) * get_rng(rng).randn(*k_shape)  # 1/sqrt(fanin+fanout)
+        b = np.zeros(n_out_maps) if use_biases else False
+        return ConvolverSpec(w, b, mode)
+
     def shape_transfer(self, (n_samples, n_maps, size_y, size_x)):
         return (n_samples, self.w.shape[0])+{
             'same': (size_y, size_x),
diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index 7537aa2..94b7304 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -53,7 +53,7 @@ def from_spec(cls, spec):
         return ConvLayer(
             w=spec.w,
             b=spec.b,
-            border_mode= {'full': 0, 'same': 1, 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode,
+            border_mode= {'full': 0, 'same': 'half', 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode,
             filter_flip=False
             )
 
@@ -267,7 +267,7 @@ def specifier_to_layer(spec, force_shared_parameters=True, rng = None):
             w=spec.w,
             b=spec.b,
             force_shared_parameters=force_shared_parameters,
-            border_mode= {'full': 0, 'same': 1, 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode,
+            border_mode= {'full': 0, 'same': 'half', 'valid': 0}[spec.mode] if spec.mode in ('full', 'same', 'valid') else spec.mode,
             filter_flip=False
             ),
         NonlinearitySpec: lambda: Nonlinearity(spec.func),
diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 7bad6ae..1960cc6 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -1,4 +1,5 @@
 from abc import abstractmethod
+from collections import OrderedDict
 
 import numpy as np
 from artemis.general.should_be_builtins import izip_equal
@@ -13,14 +14,20 @@
 
 
 class ManualBackpropNet(ISymbolicPredictor):
-
+    """
+    A sequential (chain) network where you can insert layers that do backprop manually.
+    """
     def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass_loss = True):
         """
         :param layrs:
         :param optimizer:
         :param loss:
         """
-        self.layers = layers
+        if isinstance(layers, OrderedDict):
+            self.layer_names, self.layers = zip(*layers.items())
+        else:
+            self.layer_names = range(len(layers))
+            self.layers = layers
         self.optimizer = optimizer
         self.pass_loss = pass_loss
         self.loss = get_named_cost_function(loss) if isinstance(loss, basestring) else loss
@@ -35,7 +42,7 @@ def predict(self, x):
 
     def _predict_in_single_pass(self, x):
         for i, layer in enumerate(self.layers):
-            x = layer.forward_pass(x)
+            x = layer(x)
         return x
 
     @symbolic
@@ -43,20 +50,28 @@ def _predict_minibatch(self, start, end, x):
         return self.predict(x[start:end], _single_pass=True)
 
     @symbolic
-    def train(self, x, y):
-        states = {}
+    def forward_pass_and_state(self, x):
+        state = {}
         for layer in self.layers:
-            x, layer_state = layer.forward_pass_and_state(x)
-            states[layer]=layer_state
+            if isinstance(layer, IManualBackpropLayer):
+                x, layer_state = layer.forward_pass_and_state(x)
+            else:
+                out = layer(x)
+                layer_state = (x, out)
+                x = out
+            state[layer]=layer_state
+        return x, state
+
+    def backward_pass(self, state, grad, loss):
+        assert (grad is None) != (loss is None), 'Gove me a grad xor give me a loss.'
         layerwise_param_grad_pairs = []
-        loss = self.loss(x, y)
-        if self.pass_loss:
-            grad = None
-        else:
-            grad = tt.grad(loss, wrt=x)
-            loss = None
         for layer in self.layers[::-1]:
-            grad, param_grads = layer.backward_pass(state=states[layer], grad=grad, cost = loss)
+            if isinstance(layer, IManualBackpropLayer):
+                grad, param_grads = layer.backward_pass(state=state[layer], grad=grad, cost = loss)
+            else:
+                out, y = state[layer]
+                grads = tt.grad(cost=loss, wrt=[out] + list(layer.parameters), known_grads={y: grad} if grad is not None else None)
+                grad, param_grads = grads[0], grads[1:]
             loss = None
             layerwise_param_grad_pairs.append(list(izip_equal(layer.parameters, param_grads)))
         if isinstance(self.optimizer, IGradientOptimizer):
@@ -66,7 +81,17 @@ def train(self, x, y):
             for optimizer, layer_pairs in izip_equal(self.optimizer, layerwise_param_grad_pairs):
                 params, grads = zip(*layer_pairs)
                 optimizer.update_from_gradients(parameters=params, gradients=grads)
-        return create_constant(0.)  # scan demands some return
+
+    @symbolic
+    def train(self, x, y):
+        out, state = self.forward_pass_and_state(x)
+        loss = self.loss(out, y)
+        if self.pass_loss:
+            grad = None
+        else:
+            grad = tt.grad(loss, wrt=out)
+            loss = None
+        self.backward_pass(state, grad, loss)
 
     @property
     def parameters(self):
@@ -83,6 +108,9 @@ def forward_pass(self, x):
         out, _ = self.forward_pass_and_state(x)
         return out
 
+    def __call__(self, *args):
+        return self.forward_pass(*args)
+
     @abstractmethod
     def forward_pass_and_state(self, x):
         """
@@ -93,7 +121,7 @@ def forward_pass_and_state(self, x):
                 state is a list of state-variables to be passed into the backward pass.
                 Importantly, they must be in order (so that the last element of state is the one used to compute the gradient)
         """
-
+        raise NotImplementedError()
 
     @abstractmethod
     def backward_pass(self, state, grad, cost):
@@ -102,6 +130,20 @@ def backward_pass(self, state, grad, cost):
         :param grad: The incoming gradient
         :return: The outgoing gradient
         """
+        raise NotImplementedError()
+
+
+class ManualBackPropChain(IManualBackpropLayer):
+
+    def __init__(self, layers):
+        if isinstance(layers, OrderedDict):
+            self.layer_names, self.layers = zip(*layers.items())
+        else:
+            self.layer_names = range(len(layers))
+            self.layers = layers
+
+
+
 
 
 class ExactBackpropLayer(IManualBackpropLayer):
diff --git a/plato/tools/mlp/mlp.py b/plato/tools/mlp/mlp.py
index c97a2e6..2b86baa 100644
--- a/plato/tools/mlp/mlp.py
+++ b/plato/tools/mlp/mlp.py
@@ -28,8 +28,8 @@ def __call__(self, x):
         return x
 
     @symbolic
-    def get_layer_activations(self, x):
-        activations = []
+    def get_layer_activations(self, x, include_input = False):
+        activations = [x] if include_input else []
         for lay in self.layers:
             x = lay(x)
             activations.append(x)
diff --git a/plato/tools/mlp/test_manual_backprop_net.py b/plato/tools/mlp/test_manual_backprop_net.py
index d606a3d..6cc5a19 100644
--- a/plato/tools/mlp/test_manual_backprop_net.py
+++ b/plato/tools/mlp/test_manual_backprop_net.py
@@ -25,30 +25,45 @@ def test_exact_manual_backprop_net():
         optimizer = GradientDescent(0.1),
         loss = 'softmax-xe'
         )
+    stick_shifted_by_robot = ManualBackpropNet(
+        layers = MultiLayerPerceptron.from_weights(weights=ws, hidden_activations='relu', output_activation='linear').layers,
+        optimizer = GradientDescent(0.1),
+        loss = 'softmax-xe'
+        )
 
     # Check forward passes match
     fp_auto = auto_mlp.predict.compile()
     fp_stick = stick_mlp.predict.compile()
+    fp_robot = stick_shifted_by_robot.predict.compile()
+
     out_auto = fp_auto(x)
     out_stick = fp_stick(x)
+    out_robot = fp_robot(x)
     assert np.allclose(out_auto, out_stick)
+    assert np.allclose(out_auto, out_robot)
 
     # 1 Iteration of training
     ft_auto = auto_mlp.train.compile()
     ft_stick = stick_mlp.train.compile()
+    ft_robot = stick_shifted_by_robot.train.compile()
     ft_auto(x, y)
     ft_stick(x, y)
+    ft_robot(x, y)
 
     # Check parameter changes match
     dw0_auto = auto_mlp._function.layers[0].linear_transform.w.get_value() - ws[0]
     dw0_stick = stick_mlp.layers[0].linear_transform.w.get_value() - ws[0]
+    dw0_robot = stick_shifted_by_robot.layers[0].linear_transform.w.get_value() - ws[0]
     assert np.allclose(dw0_auto, dw0_stick)
+    assert np.allclose(dw0_auto, dw0_robot)
 
     # Check outputs match
     new_out_auto = fp_auto(x)
     new_out_stick = fp_stick(x)
+    new_out_robot = fp_robot(x)
     assert np.allclose(new_out_auto, new_out_stick)
     assert not np.allclose(new_out_stick, out_auto)
+    assert np.allclose(new_out_auto, new_out_robot)
 
 
 if __name__ == '__main__':
diff --git a/plato/tools/optimization/cost.py b/plato/tools/optimization/cost.py
index ef23e2b..32a5eac 100644
--- a/plato/tools/optimization/cost.py
+++ b/plato/tools/optimization/cost.py
@@ -61,7 +61,11 @@ def negative_log_likelihood_dangerous(actual, target):
 
 @symbolic_simple
 def mean_squared_error(actual, target):
-    return tt.mean(tt.sum((actual-target)**2, axis = 1), axis = 0)
+    if actual.ndim==2:
+        return tt.mean(tt.sum((actual-target)**2, axis = 1), axis = 0)
+    else:
+        return tt.mean(tt.sum((actual.flatten(2)-target.flatten(2))**2, axis = 1), axis = 0)
+
 
 
 @symbolic_simple

From 1c17d5345c5e0d5d1ef8a53ff679dc4db06f5604 Mon Sep 17 00:00:00 2001
From: peter <peter.ed.oconnor@gmail.com>
Date: Tue, 17 Oct 2017 22:03:40 +0200
Subject: [PATCH 11/29] fixes

---
 plato/interfaces/interfaces.py         |  8 +++++
 plato/tools/mlp/manual_backprop_net.py | 45 ++++++++++++++------------
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/plato/interfaces/interfaces.py b/plato/interfaces/interfaces.py
index 0a38516..c1edfb3 100644
--- a/plato/interfaces/interfaces.py
+++ b/plato/interfaces/interfaces.py
@@ -20,6 +20,14 @@ def set_parameter_states(self, states):
             p.set_value(s)
 
 
+def get_parameters(obj):
+
+    if isinstance(obj, IParameterized) or hasattr(obj, 'parameters'):
+        return obj.parameters
+    else:
+        return []
+
+
 class IFreeEnergy(object):
 
     __metaclass__ = ABCMeta
diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 1d916fd..bc413f5 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -2,9 +2,11 @@
 from collections import OrderedDict
 
 import numpy as np
+
+from artemis.general.nested_structures import get_leaf_values, NestedType
 from artemis.general.should_be_builtins import izip_equal
 from plato.core import create_constant, symbolic
-from plato.interfaces.helpers import batchify_function, get_named_activation_function
+from plato.interfaces.helpers import batchify_function, get_named_activation_function, get_parameters_or_not
 from plato.interfaces.interfaces import IParameterized
 from plato.tools.common.online_predictors import ISymbolicPredictor
 from plato.tools.mlp.mlp import FullyConnectedTransform
@@ -52,13 +54,13 @@ def train(self, x, y):
         else:
             grad = tt.grad(loss, wrt=out)
             loss = None
-        _, layerwise_param_grad_pairs = backward_pass(self.model, state=state, grad=grad, loss=loss)
+        _, param_grad_pairs = backward_pass(self.model, state=state, grad=grad, loss=loss)
 
         if isinstance(self.optimizer, IGradientOptimizer):
-            all_params, all_param_grads = zip(*[(p, g) for layer_pairs in layerwise_param_grad_pairs for p, g in layer_pairs])
+            all_params, all_param_grads = zip(*[(p, g) for p, g in param_grad_pairs])
             self.optimizer.update_from_gradients(parameters=all_params, gradients=all_param_grads)
         elif isinstance(self.optimizer, (list, tuple)):
-            for optimizer, layer_pairs in izip_equal(self.optimizer, layerwise_param_grad_pairs):
+            for optimizer, layer_pairs in izip_equal(self.optimizer, param_grad_pairs):
                 params, grads = zip(*layer_pairs)
                 optimizer.update_from_gradients(parameters=params, gradients=grads)
 
@@ -78,7 +80,6 @@ def forward_pass(self, x):
         out, _ = self.forward_pass_and_state(x)
         return out
 
-
     def __call__(self, *args):
         return self.forward_pass(*args)
 
@@ -115,13 +116,14 @@ def forward_pass_and_state(layer, x):
 
 def backward_pass(layer, state, grad, loss):
     if isinstance(layer, IManualBackpropLayer):
-        grad, param_grads = layer.backward_pass(state=state, grad=grad, loss= loss)
+        grad_inputs, param_grad_pairs = layer.backward_pass(state=state, grad=grad, loss= loss)
     else:
-        out, y = state
-        grads = tt.grad(cost=loss, wrt=[out] + list(layer.parameters), known_grads={y: grad} if grad is not None else None)
-        grad, param_grads = grads[0], grads[1:]
-
-    return grad, param_grads
+        inputs, y = state
+        params = list(get_parameters_or_not(layer))
+        grad_inputs = tt.grad(cost=loss, wrt=inputs, known_grads={y: grad} if grad is not None else None)
+        grad_params = tt.grad(cost=loss, wrt=params, known_grads={y: grad} if grad is not None else None)
+        param_grad_pairs = [(p, g) for p, g in izip_equal(params, grad_params)]
+    return grad_inputs, param_grad_pairs
 
 
 class ChainNetwork(IManualBackpropLayer):
@@ -144,12 +146,12 @@ def forward_pass_and_state(self, x):
     @symbolic
     def backward_pass(self, state, grad, loss):
         assert (grad is None) != (loss is None), 'Gove me a grad xor give me a loss.'
-        layerwise_param_grad_pairs = []
+        param_grad_pairs = []
         for layer in self.layers[::-1]:
-            grad, param_grads = backward_pass(layer, state[layer], grad, loss)
+            grad, layer_param_grad_pairs = backward_pass(layer, state[layer], grad, loss)
             loss = None
-            layerwise_param_grad_pairs.append(list(izip_equal(layer.parameters, param_grads)))
-        return grad, layerwise_param_grad_pairs
+            param_grad_pairs += layer_param_grad_pairs
+        return grad, param_grad_pairs
 
     @property
     def parameters(self):
@@ -178,14 +180,14 @@ def forward_pass_and_state(self, (x1, x2)):
         out1, state1 = forward_pass_and_state(self.f_siamese, x1)
         out2, state2 = forward_pass_and_state(self.f_siamese, x2)
         out, state_merge = forward_pass_and_state(self.f_merge, (out1, out2))
-        return out, state_merge
+        return out, (state1, state2, state_merge)
 
     @symbolic
     def backward_pass(self, state, grad, loss):
         state1, state2, state_merge = state
-        grad, merge_param_grads = backward_pass(self.f_merge, state=state_merge, grad=grad, loss=loss)
-        grad1, f1_param_grads = backward_pass(self.f_siamese, state=state1, grad=grad, loss=None)
-        grad2, f2_param_grads = backward_pass(self.f_siamese, state=state2, grad=grad, loss=None)
+        (out_grad_1, out_grad_2), merge_param_grads = backward_pass(self.f_merge, state=state_merge, grad=grad, loss=loss)
+        grad1, f1_param_grads = backward_pass(self.f_siamese, state=state1, grad=out_grad_1, loss=None)
+        grad2, f2_param_grads = backward_pass(self.f_siamese, state=state2, grad=out_grad_2, loss=None)
         assert all(param1 is param2 for (param1, _), (param2, _) in zip(f1_param_grads, f2_param_grads))
         param_grads = [(p1, v1+v2) for (p1, v1), (p2, v2) in zip(f1_param_grads, f2_param_grads)] + merge_param_grads
         return (grad1, grad2), param_grads
@@ -230,9 +232,10 @@ def backward_pass(self, state, grad, loss):
             dcdw = x.T.dot(dcdp)  # Because I think if we did this directly for the ws we'd be in trouble
             dcdb = dcdp.sum(axis=0)
             dcdx = dcdp.dot(self.linear_transform.w.T)
-            return dcdx, [dcdw, dcdb]
+            return dcdx, list(izip_equal(self.linear_transform.parameters, [dcdw, dcdb]))
         else:
-            return tt.grad(loss, wrt=x), tt.grad(loss, wrt=self.linear_transform.parameters)
+            param_grads = tt.grad(loss, wrt=self.linear_transform.parameters)
+            return tt.grad(loss, wrt=x), list(izip_equal(self.linear_transform.parameters, param_grads))
 
     @property
     def parameters(self):

From 11601b5d1a26c9464d0a5f00f49775c9c121627f Mon Sep 17 00:00:00 2001
From: peter <peter.ed.oconnor@gmail.com>
Date: Wed, 18 Oct 2017 13:40:12 +0200
Subject: [PATCH 12/29] sfdfds

---
 plato/tools/mlp/manual_backprop_net.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index bc413f5..c96a132 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -240,3 +240,8 @@ def backward_pass(self, state, grad, loss):
     @property
     def parameters(self):
         return self.linear_transform.parameters
+
+# woooo
+#fdsfdsf
+
+# ccccc
\ No newline at end of file

From 6851e7e19829fefc69b430f5852492e17e7c3728 Mon Sep 17 00:00:00 2001
From: peter <peter.ed.oconnor@gmail.com>
Date: Wed, 18 Oct 2017 17:26:56 +0200
Subject: [PATCH 13/29] addinglayer

---
 plato/tools/mlp/manual_backprop_net.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index c96a132..0b15712 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -197,9 +197,23 @@ def parameters(self):
         return self.f_siamese.parameters + self.f_merge.parameters
 
 
+class AddingLayer(IManualBackpropLayer):
+
+    def forward_pass_and_state(self, (x1, x2)):
+        return x1+x2, None
+
+    def backward_pass(self, state, grad, loss):
+        return (grad, grad), []
+
+    @property
+    def parameters(self):
+        return []
+
 class ExactBackpropLayer(IManualBackpropLayer):
     """
     Performs the function of a layer.
+
+    (Not really useful, since you can now just feed any old function into a manual backprop net)
     """
 
     def __init__(self, linear_transform, nonlinearity):

From 0ae9ac0fb24c5738248da80c49c080f6a70b3773 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Thu, 19 Oct 2017 15:04:56 +0200
Subject: [PATCH 14/29] dfsdf

---
 plato/tools/convnet/convnet.py         | 12 ++++++++++++
 plato/tools/mlp/manual_backprop_net.py |  9 +++++++++
 2 files changed, 21 insertions(+)

diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index 94b7304..f79d2a7 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -80,6 +80,18 @@ def from_spec(cls, spec):
         return Nonlinearity(spec.func)
 
 
+@symbolic
+class CrossConvLayer(object):
+
+    def __init__(self, ):
+
+    def __call__(self, (x1, x2)):
+        """
+        (x1, x2) are each n_samples, n_maps
+        :return:
+        """
+        map = tt.nnet.conv2d(input=x1, filters=x1, )
+
 @symbolic
 class Pooler(FeedForwardModule):
 
diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 0b15712..44f03fd 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -209,6 +209,15 @@ def backward_pass(self, state, grad, loss):
     def parameters(self):
         return []
 
+
+@symbolic
+class ConcatenationLayer(object):
+
+    def __call__(self, (x1, x2)):
+        return tt.concatenate([x1.flatten(2), x2.flatten(2)], axis=1)
+
+
+
 class ExactBackpropLayer(IManualBackpropLayer):
     """
     Performs the function of a layer.

From ee24f5d2eb8f45e353d0eacc22e8fb1b119808d5 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Thu, 19 Oct 2017 15:05:20 +0200
Subject: [PATCH 15/29] fdsfds

---
 plato/tools/convnet/convnet.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index f79d2a7..094e044 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -84,6 +84,7 @@ def from_spec(cls, spec):
 class CrossConvLayer(object):
 
     def __init__(self, ):
+        pass
 
     def __call__(self, (x1, x2)):
         """

From ee89481eb05ebc199fcee8f22f53837315a75af9 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Thu, 19 Oct 2017 16:57:42 +0200
Subject: [PATCH 16/29] crosscorrlayer works

---
 plato/tools/convnet/conv_specifiers.py |  2 +-
 plato/tools/convnet/convnet.py         | 30 ++++++++++++++++++++------
 plato/tools/convnet/test_convnet.py    | 27 ++++++++++++++++++-----
 3 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/plato/tools/convnet/conv_specifiers.py b/plato/tools/convnet/conv_specifiers.py
index 158beee..c95636e 100644
--- a/plato/tools/convnet/conv_specifiers.py
+++ b/plato/tools/convnet/conv_specifiers.py
@@ -48,7 +48,7 @@ def __init__(self, w, b, mode):
         """
         assert w.ndim==4
         assert b is False or (b.ndim==1 and w.shape[0] == len(b)), "Number of output maps must match"
-        assert isinstance(mode, int) or mode in ('same', 'valid', 'full'), 'Mode "%s" not allowed' % (mode, )
+        assert isinstance(mode, int) or mode in ('same', 'valid', 'full', 'half'), 'Mode "%s" not allowed' % (mode, )
         self.w=w
         self.b=b
         self.mode = mode
diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index 094e044..e83daa5 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -1,4 +1,6 @@
 from collections import OrderedDict
+from functools import partial
+
 import numpy as np
 import theano
 import theano.tensor as tt
@@ -81,17 +83,30 @@ def from_spec(cls, spec):
 
 
 @symbolic
-class CrossConvLayer(object):
+class ChannelwiseCrossCorr(object):
 
-    def __init__(self, ):
-        pass
+    def __init__(self, border_mode='full', subsample=(1, 1)):
+        self.border_mode = border_mode
+        self.subsample = subsample
 
     def __call__(self, (x1, x2)):
         """
-        (x1, x2) are each n_samples, n_maps
-        :return:
+        (x1, x2) are each (n_samples, n_channels, size_y, size_x) images
+        :return: A (n_samples, n_channels, size_y, size_x) image representing the channelwise cross-convolution between
+            each pair of images.
         """
-        map = tt.nnet.conv2d(input=x1, filters=x1, )
+        from theano.tensor.signal.conv import conv2d as sconv2d
+
+        # Flatten samples, channels
+
+        x1_flat = x1.reshape((x1.shape[0]*x1.shape[1], x2.shape[2], x2.shape[3]))
+        x2_flat = x2.reshape((x2.shape[0]*x2.shape[1], x2.shape[2], x2.shape[3]))[:, ::-1, ::-1]
+
+        map_flat, _ = theano.scan(partial(sconv2d, border_mode=self.border_mode, subsample=self.subsample), sequences=[x1_flat, x2_flat])
+
+        conv_maps = map_flat.reshape((x1.shape[0], x1.shape[1], map_flat.shape[1], map_flat.shape[2]))
+
+        return conv_maps
 
 @symbolic
 class Pooler(FeedForwardModule):
@@ -263,7 +278,8 @@ def parameters(self):
         return sum([l.parameters if isinstance(l, IParameterized) else [] for l in self.layers.values()], [])
 
     def to_spec(self):
-        return ConvNetSpec(OrderedDict((layer_name, lay.to_spec()) for layer_name, lay in self.layers.iteritems()))
+        # return ConvNetSpec(OrderedDict((layer_name, lay.to_spec()) for layer_name, lay in self.layers.iteritems()))
+        return OrderedDict((layer_name, lay.to_spec()) for layer_name, lay in self.layers.iteritems())
 
     @classmethod
     def from_spec(cls, spec):
diff --git a/plato/tools/convnet/test_convnet.py b/plato/tools/convnet/test_convnet.py
index ab97f1b..686c268 100644
--- a/plato/tools/convnet/test_convnet.py
+++ b/plato/tools/convnet/test_convnet.py
@@ -1,17 +1,18 @@
-from collections import OrderedDict
 import pickle
+from collections import OrderedDict
 
 import numpy as np
 
+from artemis.general.mymath import argmaxnd
+from artemis.ml.datasets.cifar import get_cifar_10_dataset
+from artemis.ml.predictors.train_and_test import percent_argmax_correct
 from plato.tools.common.online_predictors import GradientBasedPredictor
 from plato.tools.common.training import assess_online_symbolic_predictor
 from plato.tools.convnet.conv_specifiers import ConvInitSpec, NonlinearitySpec, PoolerSpec
-from plato.tools.convnet.convnet import ConvNet, ConvLayer, Pooler, normalize_convnet, Nonlinearity
+from plato.tools.convnet.convnet import ConvNet, ConvLayer, Pooler, normalize_convnet, Nonlinearity, \
+    ChannelwiseCrossCorr
 from plato.tools.optimization.cost import negative_log_likelihood_dangerous
 from plato.tools.optimization.optimizers import AdaMax
-from artemis.ml.predictors.train_and_test import percent_argmax_correct
-from artemis.ml.datasets.cifar import get_cifar_10_dataset
-
 
 __author__ = 'peter'
 
@@ -92,6 +93,22 @@ def test_normalize_convnet():
         assert 0.9999 < act[layer_name].std() < 1.0001
 
 
+def test_cross_conv_layer():
+
+    x_shift, y_shift = 3, -5
+    rng = np.random.RandomState(1234)
+    full_x = rng.randn(1, 10, 40, 40)
+    x1 = full_x[:, :, 10:30, 10:30]
+    x2 = full_x[:, :, 10+y_shift:30+y_shift, 10+x_shift:30+x_shift]
+    func = ChannelwiseCrossCorr().compile()
+    y = func((x1, x2))
+    assert y.shape==(1, 10, 39, 39)
+    # dbplot(y)
+    ixs = np.array([argmaxnd(y[0, i, :, :]) for i in xrange(10)])
+    assert np.all(ixs-39//2 == (y_shift, x_shift))
+
+
 if __name__ == '__main__':
     test_convnet_serialization()
     test_normalize_convnet()
+    test_cross_conv_layer()

From df31f3338d0035cadb6e4ac977ad8095c7fb9f5e Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Thu, 19 Oct 2017 19:36:28 +0200
Subject: [PATCH 17/29] crosscorr

---
 plato/tools/convnet/convnet.py      | 15 ++++++---------
 plato/tools/convnet/test_convnet.py |  7 ++++---
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index e83daa5..c2029fe 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -4,6 +4,8 @@
 import numpy as np
 import theano
 import theano.tensor as tt
+from theano.tensor.nnet import conv3d2d
+
 from artemis.general.numpy_helpers import get_rng
 from plato.core import symbolic, create_shared_variable
 from plato.interfaces.helpers import get_named_activation_function, get_theano_rng
@@ -91,21 +93,15 @@ def __init__(self, border_mode='full', subsample=(1, 1)):
 
     def __call__(self, (x1, x2)):
         """
-        (x1, x2) are each (n_samples, n_channels, size_y, size_x) images
-        :return: A (n_samples, n_channels, size_y, size_x) image representing the channelwise cross-convolution between
+        :param (x1, x2): are each (n_samples, n_channels, size_y, size_x) images
+        :return: A (n_samples, n_channels, size_y*2-1, size_x*2-1) image representing the channelwise cross-correlation between
             each pair of images.
         """
         from theano.tensor.signal.conv import conv2d as sconv2d
-
-        # Flatten samples, channels
-
         x1_flat = x1.reshape((x1.shape[0]*x1.shape[1], x2.shape[2], x2.shape[3]))
         x2_flat = x2.reshape((x2.shape[0]*x2.shape[1], x2.shape[2], x2.shape[3]))[:, ::-1, ::-1]
-
         map_flat, _ = theano.scan(partial(sconv2d, border_mode=self.border_mode, subsample=self.subsample), sequences=[x1_flat, x2_flat])
-
         conv_maps = map_flat.reshape((x1.shape[0], x1.shape[1], map_flat.shape[1], map_flat.shape[2]))
-
         return conv_maps
 
 @symbolic
@@ -133,7 +129,8 @@ def __call__(self, x):
         :param x: An (n_samples, n_maps, size_y, size_x) tensor
         :return: An (n_sample, n_maps, size_y/ds[0], size_x/ds[1]) tensor
         """
-        return pool_2d(x, ds = self.region, st = self.stride, mode = self.mode, ignore_border=True)
+        # return pool_2d(x, ds = self.region, st = self.stride, mode = self.mode, ignore_border=True)
+        return pool_2d(x, ws = self.region, stride = self.stride, mode = self.mode, ignore_border=True)
 
     def to_spec(self):
         return PoolerSpec(region = self.region, stride=self.stride, mode=self.mode)
diff --git a/plato/tools/convnet/test_convnet.py b/plato/tools/convnet/test_convnet.py
index 686c268..7ad1ca5 100644
--- a/plato/tools/convnet/test_convnet.py
+++ b/plato/tools/convnet/test_convnet.py
@@ -2,6 +2,7 @@
 from collections import OrderedDict
 
 import numpy as np
+from artemis.plotting.db_plotting import dbplot
 
 from artemis.general.mymath import argmaxnd
 from artemis.ml.datasets.cifar import get_cifar_10_dataset
@@ -103,12 +104,12 @@ def test_cross_conv_layer():
     func = ChannelwiseCrossCorr().compile()
     y = func((x1, x2))
     assert y.shape==(1, 10, 39, 39)
-    # dbplot(y)
+    dbplot(y)
     ixs = np.array([argmaxnd(y[0, i, :, :]) for i in xrange(10)])
     assert np.all(ixs-39//2 == (y_shift, x_shift))
 
 
 if __name__ == '__main__':
-    test_convnet_serialization()
-    test_normalize_convnet()
+    # test_convnet_serialization()
+    # test_normalize_convnet()
     test_cross_conv_layer()

From 2067441bc1772671c9ac45493d3476af8867a465 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Thu, 19 Oct 2017 21:10:53 +0200
Subject: [PATCH 18/29] eeh

---
 plato/tools/mlp/manual_backprop_net.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 44f03fd..93d5f61 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -19,7 +19,7 @@ class ManualBackpropNet(ISymbolicPredictor):
     """
     A sequential (chain) network where you can insert layers that do backprop manually.
     """
-    def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass_loss = True):
+    def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass_loss = True, params_to_train = None):
         """
         :param layrs:
         :param optimizer:
@@ -33,6 +33,7 @@ def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass
         self.pass_loss = pass_loss
         self.loss = get_named_cost_function(loss) if isinstance(loss, basestring) else loss
         self.prediction_minibatch_size = prediction_minibatch_size
+        self.params_to_train = params_to_train
 
     @symbolic
     def predict(self, x):
@@ -56,6 +57,11 @@ def train(self, x, y):
             loss = None
         _, param_grad_pairs = backward_pass(self.model, state=state, grad=grad, loss=loss)
 
+        if self.params_to_train is not None:
+            params_in_net = set(p for p, g in param_grad_pairs)
+            assert params_in_net.issuperset(self.params_to_train), 'You listed parameters to train {} which were not in the model'.format(set(self.params_to_train).difference(params_in_net))
+            param_grad_pairs = [(p, g) for p, g in param_grad_pairs if p in self.params_to_train]
+
         if isinstance(self.optimizer, IGradientOptimizer):
             all_params, all_param_grads = zip(*[(p, g) for p, g in param_grad_pairs])
             self.optimizer.update_from_gradients(parameters=all_params, gradients=all_param_grads)
@@ -155,7 +161,7 @@ def backward_pass(self, state, grad, loss):
 
     @property
     def parameters(self):
-        return [p for layer in self.layers for p in layer.parameters]
+        return [p for layer in self.layers for p in get_parameters_or_not(layer)]
 
 
 class SiameseNetwork(IManualBackpropLayer):

From 171c9f3e111d681e02b1bad915d09ce78c7ac60b Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Sat, 21 Oct 2017 00:01:34 +0200
Subject: [PATCH 19/29] ok

---
 plato/tools/mlp/manual_backprop_net.py | 44 ++++++++++++++++++--------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 93d5f61..20ae160 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -63,7 +63,7 @@ def train(self, x, y):
             param_grad_pairs = [(p, g) for p, g in param_grad_pairs if p in self.params_to_train]
 
         if isinstance(self.optimizer, IGradientOptimizer):
-            all_params, all_param_grads = zip(*[(p, g) for p, g in param_grad_pairs])
+            all_params, all_param_grads = zip(*[(p, g) for p, g in param_grad_pairs]) if len(param_grad_pairs)>0 else ([], [])
             self.optimizer.update_from_gradients(parameters=all_params, gradients=all_param_grads)
         elif isinstance(self.optimizer, (list, tuple)):
             for optimizer, layer_pairs in izip_equal(self.optimizer, param_grad_pairs):
@@ -164,43 +164,59 @@ def parameters(self):
         return [p for layer in self.layers for p in get_parameters_or_not(layer)]
 
 
+class IdentityLayer(object):
+
+    def __call__(self, x):
+        return x
+
+
 class SiameseNetwork(IManualBackpropLayer):
     """
     Implements:
 
-        y = f_merge(f_siamese(x1), f_siamese(x2))
+        y = f_merge(f1(f_siamese(x1)), f2(f_siamese(x2)))
 
     """
 
-    def __init__(self, f_siamese, f_merge):
+    def __init__(self, f_siamese, f_merge, f1 = IdentityLayer(), f2 = IdentityLayer()):
         """
         :param f_siamese: A function or ManualBackpropLayer of the form f(
         :param f_merge:
         :return:
         """
         self.f_siamese = f_siamese
+        self.f1 = f1
+        self.f2 = f2
         self.f_merge = f_merge
 
     @symbolic
     def forward_pass_and_state(self, (x1, x2)):
-        out1, state1 = forward_pass_and_state(self.f_siamese, x1)
-        out2, state2 = forward_pass_and_state(self.f_siamese, x2)
-        out, state_merge = forward_pass_and_state(self.f_merge, (out1, out2))
-        return out, (state1, state2, state_merge)
+        out1a, state1a = forward_pass_and_state(self.f_siamese, x1)
+        out2a, state2a = forward_pass_and_state(self.f_siamese, x2)
+
+        out1b, state1b = forward_pass_and_state(self.f1, out1a)
+        out2b, state2b = forward_pass_and_state(self.f2, out2a)
+
+        out, state_merge = forward_pass_and_state(self.f_merge, (out1b, out2b))
+        return out, (state1a, state2a, state1b, state2b, state_merge)
 
     @symbolic
     def backward_pass(self, state, grad, loss):
-        state1, state2, state_merge = state
-        (out_grad_1, out_grad_2), merge_param_grads = backward_pass(self.f_merge, state=state_merge, grad=grad, loss=loss)
-        grad1, f1_param_grads = backward_pass(self.f_siamese, state=state1, grad=out_grad_1, loss=None)
-        grad2, f2_param_grads = backward_pass(self.f_siamese, state=state2, grad=out_grad_2, loss=None)
-        assert all(param1 is param2 for (param1, _), (param2, _) in zip(f1_param_grads, f2_param_grads))
-        param_grads = [(p1, v1+v2) for (p1, v1), (p2, v2) in zip(f1_param_grads, f2_param_grads)] + merge_param_grads
+        state1a, state2a, state1b, state2b, state_merge = state
+        (grad_out1b, grad_out2b), merge_param_grads = backward_pass(self.f_merge, state=state_merge, grad=grad, loss=loss)
+        grad_out1a, param_grads_1b = backward_pass(self.f1, state = state1b, grad=grad_out1b, loss=None)
+        grad_out2a, param_grads_2b = backward_pass(self.f2, state = state2b, grad=grad_out2b, loss=None)
+        grad1, param_grads_1a = backward_pass(self.f_siamese, state=state1a, grad=grad_out1a, loss=None)
+        grad2, param_grads_2a = backward_pass(self.f_siamese, state=state2a, grad=grad_out2a, loss=None)
+
+        assert all(param1 is param2 for (param1, _), (param2, _) in zip(param_grads_1a, param_grads_2a))
+        param_grads_siamese = [(p1, v1+v2) for (p1, v1), (p2, v2) in zip(param_grads_1a, param_grads_2a)]
+        param_grads = param_grads_siamese + param_grads_1b + param_grads_2b + merge_param_grads
         return (grad1, grad2), param_grads
 
     @property
     def parameters(self):
-        return self.f_siamese.parameters + self.f_merge.parameters
+        return get_parameters_or_not(self.f_siamese) + get_parameters_or_not(self.f_merge)
 
 
 class AddingLayer(IManualBackpropLayer):

From 96a9cee68fd6b30c821ba44bf4a16dba8df7f9d0 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Sun, 22 Oct 2017 01:09:15 +0200
Subject: [PATCH 20/29] aah

---
 plato/core.py                          | 19 +++++++++++--
 plato/tools/convnet/convnet.py         | 23 ++++++++++++++--
 plato/tools/convnet/test_convnet.py    | 10 +++++++
 plato/tools/mlp/manual_backprop_net.py | 38 ++++++++++++++++++++++++--
 4 files changed, 82 insertions(+), 8 deletions(-)

diff --git a/plato/core.py b/plato/core.py
index d25fbdb..ef7d7e3 100644
--- a/plato/core.py
+++ b/plato/core.py
@@ -539,7 +539,8 @@ def my_function(x):
     f will be an AutoCompilingFunction
     """
 
-    def __init__(self, fcn, cast_to_floatx = 'float', fixed_args = None, add_test_values = False, debug_print_shapes=False, resettable=False, **theano_function_kwargs):
+    def __init__(self, fcn, cast_to_floatx = 'float', fixed_args = None, add_test_values = False, debug_print_shapes=False,
+            resettable=False, print_initial_shapes = False, **theano_function_kwargs):
         """
         :param fcn: A symbolic function (decorated with one of the above decorators)
         :param cast_to_floatx: Case inputs  to the global float type (define this in ~/.theanorc).
@@ -577,6 +578,7 @@ def __init__(self, fcn, cast_to_floatx = 'float', fixed_args = None, add_test_va
         self._input_format = None
         self._output_format = None
         self.updated_variables = None  # Used in reset()
+        self.print_initial_shapes = print_initial_shapes
 
         # Create convenient debugging functions: showloc() and locinfo()
         __builtins__['showloc'] = show_all_locals
@@ -654,7 +656,20 @@ def __call__(self, *args, **kwargs):
                 flat_output_tensors = flat_output_tensors+traces.values()+self._original_fcn.locals().values()
 
             # Compile the theano function
-            PLATO_LOGGER.info('Compiling %s with %s inputs, %s outputs, %s updates' % (self._original_fcn.fcn_str(), len(args_and_kwarg_tensors), 1 if isinstance(outputs, Variable) else 0 if outputs is None else len(outputs), len(updates)))
+            if self.print_initial_shapes:
+                PLATO_LOGGER.info('Compiling {func_name} with: \n  {n_in} inputs: {in_shapes}\n  {n_out} outputs: {out_shapes}\n  {n_up} updates: {up_shapes}'.format(
+                    func_name = self._original_fcn.fcn_str(),
+                    n_in = len(args_and_kwarg_tensors),
+                    in_shapes = [f.shape if isinstance(f, np.ndarray) else () for f in flat_input_data],
+                    n_out = 1 if isinstance(outputs, Variable) else 0 if outputs is None else len(outputs),
+                    out_shapes = '???',
+                    n_up = len(updates),
+                    up_shapes = [p.get_value().shape for p, u in updates],
+                    ))
+            else:
+                PLATO_LOGGER.info('Compiling %s with %s inputs, %s outputs, %s updates' % (self._original_fcn.fcn_str(), len(args_and_kwarg_tensors), 1 if isinstance(outputs, Variable) else 0 if outputs is None else len(outputs), len(updates)))
+
+
             args_and_kwarg_tensors = [a for a in args_and_kwarg_tensors if not isinstance(a, SharedVariable)]  # Remove shared variables from passed-in tensor args
             if self.resettable:
                 self.updated_variables = [shared_var for shared_var, update in updates]
diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index c2029fe..f5bdd6a 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -4,6 +4,7 @@
 import numpy as np
 import theano
 import theano.tensor as tt
+from plato.tools.misc.tdb_plotting import tdbplot
 from theano.tensor.nnet import conv3d2d
 
 from artemis.general.numpy_helpers import get_rng
@@ -87,21 +88,37 @@ def from_spec(cls, spec):
 @symbolic
 class ChannelwiseCrossCorr(object):
 
-    def __init__(self, border_mode='full', subsample=(1, 1)):
+    def __init__(self, border_mode='full', meansub=True, norm = False, subsample=(1, 1), eps=1e-7, flatten_channels=False):
         self.border_mode = border_mode
         self.subsample = subsample
+        self.meansub = meansub
+        self.norm = norm
+        self.eps = eps
+        self.flatten_channels = flatten_channels
 
     def __call__(self, (x1, x2)):
         """
         :param (x1, x2): are each (n_samples, n_channels, size_y, size_x) images
         :return: A (n_samples, n_channels, size_y*2-1, size_x*2-1) image representing the channelwise cross-correlation between
-            each pair of images.
+            each pair of images.  OR
+            (n_samples, size_y*2-1, size_x*2-1) if flatten=True
         """
         from theano.tensor.signal.conv import conv2d as sconv2d
-        x1_flat = x1.reshape((x1.shape[0]*x1.shape[1], x2.shape[2], x2.shape[3]))
+        if self.meansub:
+            x1 = x1 - x1.mean(axis=(1, 2, 3), keepdims=True)
+            x2 = x2 - x2.mean(axis=(1, 2, 3), keepdims=True)
+        x1_flat = x1.reshape((x1.shape[0]*x1.shape[1], x1.shape[2], x1.shape[3]))
         x2_flat = x2.reshape((x2.shape[0]*x2.shape[1], x2.shape[2], x2.shape[3]))[:, ::-1, ::-1]
         map_flat, _ = theano.scan(partial(sconv2d, border_mode=self.border_mode, subsample=self.subsample), sequences=[x1_flat, x2_flat])
         conv_maps = map_flat.reshape((x1.shape[0], x1.shape[1], map_flat.shape[1], map_flat.shape[2]))
+
+        if self.norm:
+            conv_maps = conv_maps / tt.sqrt((conv_maps**2).mean(axis=(1, 2, 3), keepdims=True) + self.eps)
+        # tdbplot(conv_maps[0, :4, :, :], 'corrmaps')
+
+        if self.flatten_channels:
+            conv_maps = conv_maps.mean(axis=1)
+
         return conv_maps
 
 @symbolic
diff --git a/plato/tools/convnet/test_convnet.py b/plato/tools/convnet/test_convnet.py
index 7ad1ca5..8e07f2f 100644
--- a/plato/tools/convnet/test_convnet.py
+++ b/plato/tools/convnet/test_convnet.py
@@ -96,6 +96,7 @@ def test_normalize_convnet():
 
 def test_cross_conv_layer():
 
+    # Part 1: Same size
     x_shift, y_shift = 3, -5
     rng = np.random.RandomState(1234)
     full_x = rng.randn(1, 10, 40, 40)
@@ -108,6 +109,15 @@ def test_cross_conv_layer():
     ixs = np.array([argmaxnd(y[0, i, :, :]) for i in xrange(10)])
     assert np.all(ixs-39//2 == (y_shift, x_shift))
 
+    # Part 2: Different sizes
+    x3 = full_x[:, :, 15+y_shift:25+y_shift, 15+x_shift:25+x_shift]  # Same center as before, just smaller
+    y = func((x1, x3))
+    assert y.shape==(1, 10, 20+10-1, 20+10-1)
+    ixs = np.array([argmaxnd(y[0, i, :, :]) for i in xrange(10)])
+    assert np.all(ixs-(20+10-1)//2 == (y_shift, x_shift))
+
+
+
 
 if __name__ == '__main__':
     # test_convnet_serialization()
diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 20ae160..c60771a 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -2,6 +2,7 @@
 from collections import OrderedDict
 
 import numpy as np
+from plato.tools.misc.tdb_plotting import tdbplot
 
 from artemis.general.nested_structures import get_leaf_values, NestedType
 from artemis.general.should_be_builtins import izip_equal
@@ -178,15 +179,15 @@ class SiameseNetwork(IManualBackpropLayer):
 
     """
 
-    def __init__(self, f_siamese, f_merge, f1 = IdentityLayer(), f2 = IdentityLayer()):
+    def __init__(self, f_siamese, f_merge, f1 = None, f2 = None):
         """
         :param f_siamese: A function or ManualBackpropLayer of the form f(
         :param f_merge:
         :return:
         """
         self.f_siamese = f_siamese
-        self.f1 = f1
-        self.f2 = f2
+        self.f1 = IdentityLayer() if f1 is None else f1
+        self.f2 = IdentityLayer() if f2 is None else f2
         self.f_merge = f_merge
 
     @symbolic
@@ -236,10 +237,41 @@ def parameters(self):
 class ConcatenationLayer(object):
 
     def __call__(self, (x1, x2)):
+
+        # tdbplot(x1[0, :9, :, :], 'x1')
+        # tdbplot(x2[0, :9, :, :], 'x2')
+
         return tt.concatenate([x1.flatten(2), x2.flatten(2)], axis=1)
 
 
 
+@symbolic
+class ChannelConcatenationLayer(object):
+
+    def __call__(self, (x1, x2)):
+
+        # tdbplot(x1[0, :9, :, :], 'x1')
+        # tdbplot(x2[0, :9, :, :], 'x2')
+
+        return tt.concatenate([x1, x2], axis=1)
+
+
+
+
+@symbolic
+class PlottingLayer(object):
+
+    def __init__(self, func=None, name='plot_var'):
+        self.func = func
+        self.name = name
+
+    def __call__(self, x):
+        from plato.tools.misc.tdb_plotting import tdbplot
+        plot_var = x if self.func is None else self.func(x)
+        tdbplot(plot_var, self.name)
+        return x
+
+
 class ExactBackpropLayer(IManualBackpropLayer):
     """
     Performs the function of a layer.

From d3c65f1c0753f7c0c4212bad7595b85d4031a71d Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Sun, 22 Oct 2017 22:07:47 +0200
Subject: [PATCH 21/29] oook

---
 plato/tools/convnet/conv_specifiers.py | 7 ++-----
 plato/tools/convnet/convnet.py         | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/plato/tools/convnet/conv_specifiers.py b/plato/tools/convnet/conv_specifiers.py
index c95636e..54365e7 100644
--- a/plato/tools/convnet/conv_specifiers.py
+++ b/plato/tools/convnet/conv_specifiers.py
@@ -1,7 +1,7 @@
 from artemis.fileman.primitive_specifiers import PrimativeSpecifier
 from artemis.general.numpy_helpers import get_rng
 from artemis.general.should_be_builtins import bad_value
-from artemis.ml.tools.neuralnets import initialize_weight_matrix
+from artemis.ml.tools.neuralnets import initialize_weight_matrix, initialize_conv_kernel
 import numpy as np
 __author__ = 'peter'
 
@@ -56,10 +56,7 @@ def __init__(self, w, b, mode):
     @staticmethod
     def from_init(k_shape, mode, mag='xavier', use_biases=True, rng=None):
         n_out_maps, n_in_maps, k_size_y, k_size_x = k_shape
-        rng = get_rng(rng)
-        if mag == 'xavier':
-            fanin, fanout = n_in_maps*k_size_x*k_size_y, n_out_maps*k_size_x*k_size_y
-            w = 1./np.sqrt(fanin+fanout) * rng.randn(*k_shape)
+        w = initialize_conv_kernel(kernel_shape=k_shape, mag=mag, rng=rng)
         b = np.zeros(n_out_maps) if use_biases else False
         return ConvolverSpec(w, b, mode)
 
diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index f5bdd6a..be5dc00 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -297,7 +297,7 @@ def to_spec(self):
 
     @classmethod
     def from_spec(cls, spec):
-        if isinstance(spec, OrderedDict): # "old" format
+        if isinstance(spec, (list, tuple, OrderedDict)): # "old" format
             return ConvNet.from_init(spec)
         else:
             return ConvNet.from_init(spec.layer_ordered_dict)

From 544c794bf560f9c2f44e8cde1e2f5fbe0eb043f5 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Mon, 23 Oct 2017 17:03:51 +0200
Subject: [PATCH 22/29] aahhh

---
 plato/core.py                          | 4 ++--
 plato/tools/mlp/manual_backprop_net.py | 7 ++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/plato/core.py b/plato/core.py
index ef7d7e3..3a1aa1c 100644
--- a/plato/core.py
+++ b/plato/core.py
@@ -657,19 +657,19 @@ def __call__(self, *args, **kwargs):
 
             # Compile the theano function
             if self.print_initial_shapes:
-                PLATO_LOGGER.info('Compiling {func_name} with: \n  {n_in} inputs: {in_shapes}\n  {n_out} outputs: {out_shapes}\n  {n_up} updates: {up_shapes}'.format(
+                PLATO_LOGGER.info('Compiling {func_name} with: \n  {n_in} inputs: {in_shapes}\n  {n_out} outputs: {out_shapes}\n  {n_up} updates ({n_params} parameters): {up_shapes}'.format(
                     func_name = self._original_fcn.fcn_str(),
                     n_in = len(args_and_kwarg_tensors),
                     in_shapes = [f.shape if isinstance(f, np.ndarray) else () for f in flat_input_data],
                     n_out = 1 if isinstance(outputs, Variable) else 0 if outputs is None else len(outputs),
                     out_shapes = '???',
                     n_up = len(updates),
+                    n_params = sum(p.get_value().size for p, u in updates),
                     up_shapes = [p.get_value().shape for p, u in updates],
                     ))
             else:
                 PLATO_LOGGER.info('Compiling %s with %s inputs, %s outputs, %s updates' % (self._original_fcn.fcn_str(), len(args_and_kwarg_tensors), 1 if isinstance(outputs, Variable) else 0 if outputs is None else len(outputs), len(updates)))
 
-
             args_and_kwarg_tensors = [a for a in args_and_kwarg_tensors if not isinstance(a, SharedVariable)]  # Remove shared variables from passed-in tensor args
             if self.resettable:
                 self.updated_variables = [shared_var for shared_var, update in updates]
diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index c60771a..1341e1a 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -20,7 +20,8 @@ class ManualBackpropNet(ISymbolicPredictor):
     """
     A sequential (chain) network where you can insert layers that do backprop manually.
     """
-    def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass_loss = True, params_to_train = None):
+    def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass_loss = True, params_to_train = None,
+                 return_prediction = False):
         """
         :param layrs:
         :param optimizer:
@@ -35,6 +36,7 @@ def __init__(self, layers, optimizer, loss, prediction_minibatch_size=None, pass
         self.loss = get_named_cost_function(loss) if isinstance(loss, basestring) else loss
         self.prediction_minibatch_size = prediction_minibatch_size
         self.params_to_train = params_to_train
+        self.return_prediction = return_prediction
 
     @symbolic
     def predict(self, x):
@@ -71,6 +73,9 @@ def train(self, x, y):
                 params, grads = zip(*layer_pairs)
                 optimizer.update_from_gradients(parameters=params, gradients=grads)
 
+        if self.return_prediction:
+            return out
+
     @property
     def parameters(self):
         return self.model.parameters

From 22b26058d8794e422b1f571a51e567813fdc1c41 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Tue, 24 Oct 2017 15:07:35 +0200
Subject: [PATCH 23/29] fdsfds

---
 plato/tools/convnet/conv_specifiers.py | 12 ++++++++++++
 plato/tools/convnet/convnet.py         |  3 +++
 2 files changed, 15 insertions(+)

diff --git a/plato/tools/convnet/conv_specifiers.py b/plato/tools/convnet/conv_specifiers.py
index 54365e7..519410f 100644
--- a/plato/tools/convnet/conv_specifiers.py
+++ b/plato/tools/convnet/conv_specifiers.py
@@ -121,6 +121,12 @@ def shape_transfer(self, input_shape):
             n_samples, input_dims = input_shape
             return n_samples, self.w.shape[1]
 
+    @classmethod
+    def from_init(cls, n_in, n_out, mag = 'xavier', rng=None):
+        w = initialize_weight_matrix(n_in, n_out, mag=mag, rng=rng)
+        b = np.zeros(n_out)
+        return FullyConnectedSpec(w=w, b=b)
+
 
 class ConvNetSpec(PrimativeSpecifier):
 
@@ -131,6 +137,12 @@ def shape_transfer(self):
         raise NotImplementedError()
 
 
+def compute_feature_shape(input_shape, specs):
+
+    shape = input_shape
+    for spec in specs:
+        shape = spec.shape_transfer(shape)
 
+    return shape
 
 # class ConvNetSpec
\ No newline at end of file
diff --git a/plato/tools/convnet/convnet.py b/plato/tools/convnet/convnet.py
index be5dc00..3a56562 100644
--- a/plato/tools/convnet/convnet.py
+++ b/plato/tools/convnet/convnet.py
@@ -107,6 +107,9 @@ def __call__(self, (x1, x2)):
         if self.meansub:
             x1 = x1 - x1.mean(axis=(1, 2, 3), keepdims=True)
             x2 = x2 - x2.mean(axis=(1, 2, 3), keepdims=True)
+
+
+
         x1_flat = x1.reshape((x1.shape[0]*x1.shape[1], x1.shape[2], x1.shape[3]))
         x2_flat = x2.reshape((x2.shape[0]*x2.shape[1], x2.shape[2], x2.shape[3]))[:, ::-1, ::-1]
         map_flat, _ = theano.scan(partial(sconv2d, border_mode=self.border_mode, subsample=self.subsample), sequences=[x1_flat, x2_flat])

From 6bff49de237aad1c01d1814076816278aa503049 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Wed, 25 Oct 2017 17:49:33 +0200
Subject: [PATCH 24/29] dsffds

---
 plato/tools/mlp/manual_backprop_net.py | 36 +++++++++++++++++++++-----
 plato/tools/optimization/cost.py       |  1 +
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 1341e1a..8340a0c 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -6,7 +6,7 @@
 
 from artemis.general.nested_structures import get_leaf_values, NestedType
 from artemis.general.should_be_builtins import izip_equal
-from plato.core import create_constant, symbolic
+from plato.core import create_constant, symbolic, create_shared_variable, add_update
 from plato.interfaces.helpers import batchify_function, get_named_activation_function, get_parameters_or_not
 from plato.interfaces.interfaces import IParameterized
 from plato.tools.common.online_predictors import ISymbolicPredictor
@@ -46,7 +46,8 @@ def predict(self, x):
             return batchify_function(self._predict_in_single_pass, batch_size=self.prediction_minibatch_size)(x)
 
     def _predict_in_single_pass(self, x):
-        out, _ = self.model.forward_pass_and_state(x)
+        # out, _ = self.model.forward_pass_and_state(x)
+        out, _ = forward_pass_and_state(self.model, x)
         return out
 
     @symbolic
@@ -138,31 +139,54 @@ def backward_pass(layer, state, grad, loss):
     return grad_inputs, param_grad_pairs
 
 
+SNEAKILY_SAVE_ACTIVATIONS = False
+
+
+def set_sneakily_save_activations(state):
+    global SNEAKILY_SAVE_ACTIVATIONS
+    SNEAKILY_SAVE_ACTIVATIONS = state
+
 class ChainNetwork(IManualBackpropLayer):
 
-    def __init__(self, layers):
+    def __init__(self, layers, sneakily_save_activations = False):
         if isinstance(layers, OrderedDict):
             self.layer_names, self.layers = zip(*layers.items())
         else:
             self.layer_names = range(len(layers))
             self.layers = layers
 
+        self.sneakily_saved_activations = OrderedDict()
+        self.sneakily_saved_gradients = OrderedDict()
+
     @symbolic
     def forward_pass_and_state(self, x):
+        if SNEAKILY_SAVE_ACTIVATIONS:
+            self.sneakily_saved_activations['input'] = create_shared_variable(np.zeros((1,) * x.ndim))
+            add_update(self.sneakily_saved_activations['input'], x)
         state = {}
-        for layer in self.layers:
+        for layer_name, layer in zip(self.layer_names, self.layers):
             x, layer_state = forward_pass_and_state(layer, x)
             state[layer]=layer_state
+            if SNEAKILY_SAVE_ACTIVATIONS:
+                self.sneakily_saved_activations[layer_name] = create_shared_variable(np.zeros((1,) * x.ndim))
+                add_update(self.sneakily_saved_activations[layer_name], x)
         return x, state
 
     @symbolic
     def backward_pass(self, state, grad, loss):
-        assert (grad is None) != (loss is None), 'Gove me a grad xor give me a loss.'
+        assert (grad is None) != (loss is None), 'Give me a grad xor give me a loss.'
         param_grad_pairs = []
-        for layer in self.layers[::-1]:
+        # if SNEAKILY_SAVE_ACTIVATIONS:
+        #     self.sneakily_saved_gradients['output'] = create_shared_variable(np.zeros((1,) * grad.ndim))
+        #     add_update(self.sneakily_saved_activations['input'], grad)
+
+        for layer_name, layer in zip(self.layer_names[::-1], self.layers[::-1]):
             grad, layer_param_grad_pairs = backward_pass(layer, state[layer], grad, loss)
             loss = None
             param_grad_pairs += layer_param_grad_pairs
+            if SNEAKILY_SAVE_ACTIVATIONS:
+                self.sneakily_saved_gradients[layer_name] = create_shared_variable(np.zeros((1,) * grad.ndim))
+                add_update(self.sneakily_saved_gradients[layer_name], grad)
         return grad, param_grad_pairs
 
     @property
diff --git a/plato/tools/optimization/cost.py b/plato/tools/optimization/cost.py
index 32a5eac..11a7e38 100644
--- a/plato/tools/optimization/cost.py
+++ b/plato/tools/optimization/cost.py
@@ -188,6 +188,7 @@ def l1_norm_error(actual, target, eps = 1e-7):
         'onehot-mse': onehot_mse,
         'norm_l1_error': l1_norm_error,
         'softmax-xe': softmax_xe,
+        'softmax_xe': softmax_xe,
         'categorical-xe': categorical_xe,
         'logistic-xe': logistic_xe,
         }

From d9fed1bc783e1a10396de0b23baac27920757792 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Thu, 26 Oct 2017 13:58:28 +0200
Subject: [PATCH 25/29] fdsf

---
 plato/tools/mlp/manual_backprop_net.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/plato/tools/mlp/manual_backprop_net.py b/plato/tools/mlp/manual_backprop_net.py
index 8340a0c..025c04b 100644
--- a/plato/tools/mlp/manual_backprop_net.py
+++ b/plato/tools/mlp/manual_backprop_net.py
@@ -146,9 +146,10 @@ def set_sneakily_save_activations(state):
     global SNEAKILY_SAVE_ACTIVATIONS
     SNEAKILY_SAVE_ACTIVATIONS = state
 
+
 class ChainNetwork(IManualBackpropLayer):
 
-    def __init__(self, layers, sneakily_save_activations = False):
+    def __init__(self, layers, backprop_down_to=None, sneakily_save_activations = False):
         if isinstance(layers, OrderedDict):
             self.layer_names, self.layers = zip(*layers.items())
         else:
@@ -157,6 +158,7 @@ def __init__(self, layers, sneakily_save_activations = False):
 
         self.sneakily_saved_activations = OrderedDict()
         self.sneakily_saved_gradients = OrderedDict()
+        self.backprop_down_to = backprop_down_to
 
     @symbolic
     def forward_pass_and_state(self, x):
@@ -187,6 +189,8 @@ def backward_pass(self, state, grad, loss):
             if SNEAKILY_SAVE_ACTIVATIONS:
                 self.sneakily_saved_gradients[layer_name] = create_shared_variable(np.zeros((1,) * grad.ndim))
                 add_update(self.sneakily_saved_gradients[layer_name], grad)
+            if self.backprop_down_to is not None and layer_name==self.backprop_down_to:
+                break
         return grad, param_grad_pairs
 
     @property

From cb6c7097b2447e6dd4fed77aa2dbc72e894a1bdd Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Mon, 30 Jul 2018 14:58:18 +0200
Subject: [PATCH 26/29] upgraded for python 3

---
 plato/core.py                             | 13 ++++++++-----
 plato/examples/demo_prediction_example.py |  4 ++--
 plato/interfaces/helpers.py               |  1 +
 plato/tools/mlp/demo_mnist_mlp.py         | 10 +++++-----
 plato/tools/mlp/mlp.py                    |  2 +-
 plato/tools/va/demo_gaussian_vae.py       |  2 +-
 6 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/plato/core.py b/plato/core.py
index 3a1aa1c..6b49f59 100644
--- a/plato/core.py
+++ b/plato/core.py
@@ -389,6 +389,9 @@ def __eq__(self, other):
                 return True
         return False
 
+    # def __hash__(self):
+    #     return hash(self.fcn)
+
     def to_format(self, format_decorator):
 
         @format_decorator
@@ -420,7 +423,7 @@ def __get__(self, instance, other):
         # no reason to create a separate object every time we want to get the method, and (b) debugging - because we
         # attach the local variables to the method, and want to get them later, so the returned method better have
         # the same address every time we request it.
-        if instance in self._dispatched_methods:
+        if instance in tuple(self._dispatched_methods.keys()):
             return self._dispatched_methods[instance]
         else:
             return _SymbolicFunctionWrapper(self.fcn, input_format=self.input_format, output_format=self.output_format, update_format=self.update_format, attached_instance=instance)
@@ -848,10 +851,10 @@ def flattenit(var, ndim):
 
 def show_all_locals():
     locals_of_calling_frame = inspect.currentframe().f_back.f_locals
-    print '=== Locals ==='
+    print('=== Locals ===')
     for k, v_info in get_local_info(locals_of_calling_frame).iteritems():
-        print '%s = %s' % (k, v_info)
-    print '--------------'
+        print('%s = %s' % (k, v_info))
+    print('--------------')
 
 
 def get_local_info(locals_of_calling_frame=None):
@@ -920,7 +923,7 @@ def find_leaf_ancestors(variable):
 
 
 def printit(var_name, var_val):
-    print '%s: %s' % (var_name, var_val)
+    print('%s: %s' % (var_name, var_val))
 
 
 name_counts = {}
diff --git a/plato/examples/demo_prediction_example.py b/plato/examples/demo_prediction_example.py
index 0d10dca..9654bae 100644
--- a/plato/examples/demo_prediction_example.py
+++ b/plato/examples/demo_prediction_example.py
@@ -22,7 +22,7 @@ def compare_example_predictors(
         minibatch_size = 10,
     ):
     """
-    This demo shows how we can compare different online predictors.  The demo trains both predictors on the dataset,
+    This demo shows how we can compare_learning_curves different online predictors.  The demo trains both predictors on the dataset,
     returning an object that contains the results.
 
     :param test_mode: Set this to True to just run the demo quicky (but not to completion) to see that it doesn't break.
@@ -37,7 +37,7 @@ def compare_example_predictors(
         n_epochs = 1
         n_tests = 3
 
-    # Here we compare three predictors on MNIST - an MLP, a Perceptron, and a Random Forest.
+    # Here we compare_learning_curves three predictors on MNIST - an MLP, a Perceptron, and a Random Forest.
     # - The MLP is defined using Plato's interfaces - we create a Symbolic Predictor (GradientBasedPredictor) and
     #   then compile it into an IPredictor object
     # - The Perceptron directly implements the IPredictor interface.
diff --git a/plato/interfaces/helpers.py b/plato/interfaces/helpers.py
index bfd7e94..fa4a58a 100644
--- a/plato/interfaces/helpers.py
+++ b/plato/interfaces/helpers.py
@@ -100,6 +100,7 @@ def identity(x):
     'softmax': softmax,
     'sigm': tt.nnet.sigmoid,
     'sig': tt.nnet.sigmoid,
+    'clip': lambda x: tt.clip(x, 0, 1),
     'd_sigm': lambda x: tt.nnet.sigmoid(x)-tt.nnet.sigmoid(-x),
     'tanh': tt.tanh,
     'sech2': lambda x: (4*tt.cosh(x)**2)/(tt.cosh(2*x)+1)**2,
diff --git a/plato/tools/mlp/demo_mnist_mlp.py b/plato/tools/mlp/demo_mnist_mlp.py
index 6686771..d8a7fb9 100644
--- a/plato/tools/mlp/demo_mnist_mlp.py
+++ b/plato/tools/mlp/demo_mnist_mlp.py
@@ -1,8 +1,8 @@
-from artemis.experiments.experiment_record import experiment_function
+from artemis.experiments.decorators import experiment_function
 from artemis.experiments.ui import browse_experiments
 from artemis.general.test_mode import is_test_mode
 from artemis.ml.datasets.mnist import get_mnist_dataset
-from artemis.ml.predictors.train_and_test import train_and_test_online_predictor
+from artemis.ml.predictors.deprecated.train_and_test_old import train_and_test_online_predictor
 from artemis.plotting.db_plotting import dbplot, hold_dbplots
 from plato.tools.common.online_predictors import GradientBasedPredictor
 from plato.tools.mlp.mlp import MultiLayerPerceptron
@@ -87,7 +87,7 @@ def vis_callback(info, score):
 demo_mnist_mlp.add_variant('deep', hidden_sizes=[500, 500, 500, 500])
 
 # demo_mnist_mlp.get_variant('deep').run()
-print demo_mnist_mlp.get_variant('deep').get_latest_record().get_log()
+# print demo_mnist_mlp.get_variant('deep').get_latest_record().get_log()
 
 
 # X=demo_mnist_mlp.add_variant('mini-mnist', max_training_samples=1000, max_test_samples=1000, hidden_sizes=[100], n_epochs=100, visualize_params=True)
@@ -99,6 +99,6 @@ def vis_callback(info, score):
 # demo_mnist_mlp.add_variant(hidden_sizes=[])
 
 
-# if __name__ == '__main__':
+if __name__ == '__main__':
 
-    # browse_experiments()
+    browse_experiments()
diff --git a/plato/tools/mlp/mlp.py b/plato/tools/mlp/mlp.py
index 2b86baa..52e947b 100644
--- a/plato/tools/mlp/mlp.py
+++ b/plato/tools/mlp/mlp.py
@@ -92,7 +92,7 @@ def from_weights(cls, weights, biases = None, hidden_activations ='sig', output_
                 nonlinearity=nonlinearity
             )
             for w, b, nonlinearity, layer_no in
-                izip_equal(weights, [False]*len(weights) if biases is False else [0.]*len(weights) if biases in (True, None) else biases, [hidden_activations] * (n_layers - 1) + [output_activation], xrange(n_layers))
+                izip_equal(weights, [False]*len(weights) if biases is False else [0.]*len(weights) if biases in (True, None) else biases, [hidden_activations] * (n_layers - 1) + [output_activation], range(n_layers))
                 ]
         return cls(layers)
 
diff --git a/plato/tools/va/demo_gaussian_vae.py b/plato/tools/va/demo_gaussian_vae.py
index acc612f..b192003 100644
--- a/plato/tools/va/demo_gaussian_vae.py
+++ b/plato/tools/va/demo_gaussian_vae.py
@@ -1,5 +1,5 @@
 import numpy as np
-from artemis.experiments.experiment_record import experiment_function
+from artemis.experiments import experiment_function
 from artemis.experiments.ui import browse_experiments
 from artemis.general.test_mode import is_test_mode
 from artemis.ml.datasets.mnist import get_mnist_dataset

From 4e416dde45b8fa82b272169f9fabb44cf3a4253b Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Tue, 31 Jul 2018 18:02:12 +0200
Subject: [PATCH 27/29] made optimizers more functional

---
 plato/core.py                                 |  10 +-
 .../optimization/demo_compare_optimizers.py   |   5 +-
 plato/tools/optimization/optimizers.py        | 218 ++++++++++++------
 plato/tools/optimization/test_optimizers.py   |  42 +++-
 4 files changed, 192 insertions(+), 83 deletions(-)

diff --git a/plato/core.py b/plato/core.py
index 6b49f59..2b5c49b 100644
--- a/plato/core.py
+++ b/plato/core.py
@@ -115,7 +115,7 @@ class NoUpdatesFormat(IFormat):
 
     @staticmethod
     def check(data, f):
-        assert isinstance(data, list), "Updates should be in the form of a list.  Something is strange if this is not the case"
+        assert isinstance(data, list), "Updates should be in the form of a list.  Something is strange if this is not the case.  Got {}".format(data)
         if len(data)!=0:
             raise SymbolicFormatError("Function %s should have created no state updates, but it created updates: %s" % (f, data))
 
@@ -406,7 +406,7 @@ def partial(self, **fixed_kwargs):
         """
         Partially define the input arguments and return a new symbolic function.
         """
-        fixed_kwargs = {k: (tt.constant(v) if isinstance(v, np.ndarray) else v) for k, v in fixed_kwargs.iteritems()}  # This prevents
+        fixed_kwargs = {k: (tt.constant(v) if isinstance(v, np.ndarray) else v) for k, v in fixed_kwargs.items()}  # This prevents
         return _SymbolicFunctionWrapper(fcn=partial(self.fcn, **fixed_kwargs), input_format = PassAnythingFormat,
             output_format=self.output_format, update_format=self.update_format, attached_instance=self.attached_instance)
 
@@ -507,7 +507,7 @@ def _is_tuple_of_tuples_of_tensors(args):
 def _is_named_collection(arg):
     if not isinstance(arg, dict):
         return False
-    if not all(isinstance(k, (basestring, int)) for k in arg.keys()):
+    if not all(isinstance(k, (str, int)) for k in arg.keys()):
         return False
     if not all(_is_tensor(v) for v in arg.values()):
         return False
@@ -1179,7 +1179,7 @@ def add_update(self, shared_var, new_val, accumulate = None):
             self._outer_catcher.add_update(shared_var, new_val)
 
     def get_updates(self, as_dict = False):
-        return OrderedDict(self._updates.items()) if as_dict else self._updates.items()
+        return OrderedDict(self._updates.items()) if as_dict else list(self._updates.items())
 
 
 StateCatcher = CaptureUpdates  # Backwards compatibility
@@ -1317,7 +1317,7 @@ def create_shared_variable_from_zeros(shape, name = None, **shared_kwargs):
     :param shared_kwargs: Other keyword args for shared variable construction
     :return: A theano shared variable.
     """
-    assert name is None or isinstance(name, basestring)  # Mostly checks that you didn't accidentally call like create_shared_variable_from_zeros(3, 4)
+    assert name is None or isinstance(name, str)  # Mostly checks that you didn't accidentally call like create_shared_variable_from_zeros(3, 4)
     return create_shared_variable(initializer_fcn=np.zeros(shape), name=name, **shared_kwargs)
 
 
diff --git a/plato/tools/optimization/demo_compare_optimizers.py b/plato/tools/optimization/demo_compare_optimizers.py
index 4836c09..9d69a1d 100644
--- a/plato/tools/optimization/demo_compare_optimizers.py
+++ b/plato/tools/optimization/demo_compare_optimizers.py
@@ -1,10 +1,9 @@
-from artemis.experiments.experiment_record import run_experiment
 from artemis.general.mymath import sqrtspace
 from artemis.general.test_mode import is_test_mode, set_test_mode
 from artemis.ml.datasets.mnist import get_mnist_dataset
 from artemis.ml.predictors.learning_curve_plots import plot_learning_curves
 from artemis.ml.predictors.predictor_comparison import compare_predictors
-from artemis.ml.predictors.train_and_test import percent_argmax_correct
+from artemis.ml.tools.costs import percent_argmax_correct
 from artemis.ml.tools.processors import OneHotEncoding
 from artemis.plotting.pyplot_plus import set_default_figure_size
 from plato.tools.common.online_predictors import GradientBasedPredictor
@@ -110,7 +109,7 @@ def backprop_vs_difference_target_prop(
         ):
 
     dataset = get_mnist_dataset(flat = True)
-    dataset = dataset.process_with(targets_processor=lambda (x, ): (OneHotEncoding(10)(x).astype(int), ))
+    dataset = dataset.process_with(targets_processor=lambda x_s: (OneHotEncoding(10)(x_s[0]).astype(int), ))
 
     if is_test_mode():
         dataset = dataset.shorten(200)
diff --git a/plato/tools/optimization/optimizers.py b/plato/tools/optimization/optimizers.py
index 43b6052..c6ef42b 100644
--- a/plato/tools/optimization/optimizers.py
+++ b/plato/tools/optimization/optimizers.py
@@ -1,5 +1,8 @@
 from abc import abstractmethod
-from plato.core import add_update, create_shared_variable, StateCatcher, tdbprint, CaptureUpdates
+
+from theano.ifelse import ifelse
+
+from plato.core import add_update, create_shared_variable, StateCatcher, tdbprint, CaptureUpdates, symbolic_stateless
 from plato.interfaces.decorators import symbolic_updater
 import theano.tensor as tt
 import theano
@@ -19,11 +22,25 @@ def __call__(self, cost, parameters):
         """
 
     @abstractmethod
-    def get_updates(self, cost, parameters, constants = []):
+    def get_updates(self, cost, parameters, constants = ()):
+        """
+        :param Scalar cost:
+        :param Sequence[Variable] parameters:
+        :param Sequence[Variable] constants:
+        :return Sequence[Tuple[Tensor, Tensor]]: Pairs of (variable, new_variable)
+        """
         pass
 
     @abstractmethod
-    def update_parameters(self, cost, parameters, constants=[]):
+    def get_updates_from_gradients(self, parameters, gradients):
+        """
+        :param Sequence[Tensor] parameters:
+        :param Sequence[Tensor] gradients:
+        :return Sequence[Tuple[Tensor, Tensor]]:
+        """
+
+    @abstractmethod
+    def update_parameters(self, cost, parameters, constants=()):
         pass
 
     @abstractmethod
@@ -44,14 +61,13 @@ def __call__(self, cost, parameters, constants = []):
         """
         self.update_parameters(cost=cost, parameters=parameters, constants=constants)
 
-    def get_updates(self, cost, parameters, constants = [], as_dict = False):
+    def get_updates(self, cost, parameters, constants = [], clip=None):
         """
         Get the gradient-based parameter updates, but do not apply them.
         return: A list of (shared_var, new_val) pairs representing the updates.
         """
-        with CaptureUpdates(swallow=True) as sc:
-            self(cost=cost, parameters=parameters, constants=constants)
-        return sc.get_updates(as_dict=as_dict)
+        gradients = theano.grad(cost, parameters, consider_constant = constants)
+        return self.get_updates_from_gradients(parameters=parameters, gradients=gradients, clip=clip)
 
     def update_parameters(self, cost, parameters, constants = []):
         """
@@ -69,20 +85,39 @@ def update_from_gradients(self, parameters, gradients, clip = None):
         :param gradients: A list of corresponding gradients
         :param clip: Optionally, a 2-tuple indicating the range in which to clip parameters, (or
         """
+        updates = self.get_updates_from_gradients(parameters=parameters, gradients=gradients, clip=clip)
+        for p, v in updates:
+            add_update(p, v)
+
+    @symbolic_stateless
+    def get_updates_from_gradients(self, parameters, gradients, clip=None):
+        """
+        :param Sequence[Variable] parameters: The list of symbolic parameters
+        :param Sequence[Variable] gradients: The list of gradients
+        :param Optional[Union[float, Tuple[float,float]]] clip: The clipping parameter
+        :return Sequence[Tuple[Variable, Variable]]: The list of updates (the first len(parameters) of which are ordered parameter updates - the rest are for optimizer params).
+        """
         if clip is not None and not isinstance(clip, (list, tuple)):
             clip = (-clip, clip)
         assert len(parameters)==len(gradients), 'Lenght of parameter vector must match length of gradients.'
+        parameter_updates_list = []
+        optimizer_updates_list = []
         for p, g in zip(parameters, gradients):
-            if clip is None:
-                self._update_param(p, g)
-            else:
-                with CaptureUpdates(swallow=True) as sc:
-                    self._update_param(p, g)
-                sc.get_updates()
+            updates = self._get_updates_for_param(p, g)
+            param_update = updates[0] if clip is None else (updates[0][0], tt.clip(updates[0][1], *clip))
+            parameter_updates_list.append(param_update)
+            optimizer_updates_list += updates[1:]
+        all_updates = parameter_updates_list + optimizer_updates_list
+        return all_updates
 
     @abstractmethod
-    def _update_param(self, param, gradient):
-        pass
+    def _get_updates_for_param(self, param, gradient):
+        """
+        A stateless method
+        :param Variable param: The parameter
+        :param Variable gradient: The gradient of this parameter
+        :return Sequence[Tuple[Variable, Variable]]: The updates - the first of which is the parameter update (others may update optimizer state)
+        """
 
 
 class GradientStepUpdater(UniformParameterOptimizer):
@@ -90,8 +125,9 @@ class GradientStepUpdater(UniformParameterOptimizer):
     Just subtract the gradient to the parameter.  This is mainly useful in some situations the step size doesn't matter
     (because for instance, the function is invariant to the scale of the weights)
     """
-    def _update_param(self, param, gradient):
-        add_update(param, param - gradient)
+    def _get_updates_for_param(self, param, gradient):
+        return [(param, param-gradient)]
+        # add_update(param, param - gradient)
 
 
 class SimpleGradientDescent(UniformParameterOptimizer):
@@ -106,8 +142,19 @@ def __init__(self, eta):
         """
         self._eta = eta
 
-    def _update_param(self, param, gradient):
-        add_update(param, param - self._eta * gradient)
+    def _get_updates_for_param(self, param, gradient):
+        return [(param, param - self._eta * gradient)]
+        # add_update(param, param - self._eta * gradient)
+
+
+def create_optimizer_param_like(param, name=None):
+    """
+    :param TensorVariable param: The variable that the new optimizer parameter should be "like" (same ndim and dtype).
+    :return Tuple[TensorSharedVariable, Scalar]: The variable and a scalar boolean tensor that can be used in an ifelse to check if it's been initialized.
+    """
+    opt_param = theano.shared(np.zeros([0]*param.ndim, dtype=param.dtype), name=name)
+    initialized = opt_param.size>0
+    return opt_param, initialized
 
 
 class LangevinGradientDescent(UniformParameterOptimizer):
@@ -123,8 +170,9 @@ def __init__(self, eta, rng = None):
         self._eta = eta
         self._rng = get_theano_rng(rng)
 
-    def _update_param(self, param, gradient):
-        add_update(param, param - self._eta*gradient + 2*tt.sqrt(self._eta)*self._rng.normal(size = param.ishape))
+    def _get_updates_for_param(self, param, gradient):
+        # add_update(param, param - self._eta*gradient + 2*tt.sqrt(self._eta)*self._rng.normal(size = param.ishape))
+        return[(param, param - self._eta*gradient + 2*tt.sqrt(self._eta)*self._rng.normal(size = param.ishape))]
 
 
 class Adam(UniformParameterOptimizer):
@@ -146,25 +194,31 @@ def __init__(self, alpha = 1e-3, beta_1=0.1, beta_2=0.001, eps = 1e-8):
         self.beta_2 = beta_2
         self.eps = eps
 
-    def _update_param(self, param, gradient):
+    def _get_updates_for_param(self, param, gradient):
         # Initialize variables
         i = create_shared_variable(0.)
-        m = theano.shared(param.get_value() * 0.)
-        v = theano.shared(param.get_value() * 0.)
+        # m = theano.shared(param.get_value() * 0.)
+        # v = theano.shared(param.get_value() * 0.)
+
+        m, initialized = create_optimizer_param_like(param)
+        v, _ = create_optimizer_param_like(param)
+        # v = theano.shared(param.ndim * 0.)
 
         # Recompute values
         i_t = i + 1.
         fix1 = 1. - (1. - self.beta_1)**i_t
         fix2 = 1. - (1. - self.beta_2)**i_t
         lr_t = self.alpha * (tt.sqrt(fix2) / fix1)
-        m_t = (self.beta_1 * gradient) + ((1. - self.beta_1) * m)
-        v_t = (self.beta_2 * tt.sqr(gradient)) + ((1. - self.beta_2) * v)
+        m_t = ifelse(initialized, self.beta_1 * gradient + (1. - self.beta_1) * m, self.beta_1 * gradient)
+        v_t = ifelse(initialized, self.beta_2 * tt.sqr(gradient) + (1. - self.beta_2) * v, self.beta_2 * tt.sqr(gradient))
         g_t = m_t / (tt.sqrt(v_t) + self.eps)
         p_t = param - (lr_t * g_t)
-        add_update(param, p_t)
-        add_update(m, m_t)
-        add_update(v, v_t)
-        add_update(i, i_t)
+        return [(param, p_t), (m, m_t), (v, v_t), (i, i_t)]
+
+        # add_update(param, p_t)
+        # add_update(m, m_t)
+        # add_update(v, v_t)
+        # add_update(i, i_t)
 
 
 class AdaMax(UniformParameterOptimizer):
@@ -175,15 +229,21 @@ def __init__(self, alpha = 1e-3, beta_1=0.1, beta_2=0.001, eps = 1e-8):
         self._beta_2 = beta_2
         self._eps = eps
 
-    def _update_param(self, param, gradient):
-        mom1 = theano.shared(np.zeros_like(param.get_value()))
-        mom2 = theano.shared(np.zeros_like(param.get_value()))
-        mom1_new = mom1 + self._beta_1 * (gradient - mom1)
-        mom2_new = tt.maximum(abs(gradient) + self._eps, (1. - self._beta_2) * mom2)
+    def _get_updates_for_param(self, param, gradient):
+
+        mom1, initialized = create_optimizer_param_like(param)
+        mom2, _ = create_optimizer_param_like(param)
+
+        # mom1 = theano.shared(np.zeros_like(param.get_value()))
+        # mom2 = theano.shared(np.zeros_like(param.get_value()))
+        mom1_new = ifelse(initialized, mom1 + self._beta_1 * (gradient - mom1), self._beta_1*gradient)
+        mom2_new = ifelse(initialized, tt.maximum(abs(gradient) + self._eps, (1. - self._beta_2) * mom2), abs(gradient) + self._eps)
         new_param = param - self._alpha * mom1_new / mom2_new
-        add_update(param, new_param)
-        add_update(mom1, mom1_new)
-        add_update(mom2, mom2_new)
+        return [(param, new_param), (mom1, mom1_new), (mom2, mom2_new)]
+
+        # add_update(param, new_param)
+        # add_update(mom1, mom1_new)
+        # add_update(mom2, mom2_new)
 
 
 class RMSProp(UniformParameterOptimizer):
@@ -193,12 +253,16 @@ def __init__(self, learning_rate = 0.1, decay = 0.9, max_scaling = 1e5):
         self.epsilon = 1./max_scaling
         self.learning_rate = learning_rate
 
-    def _update_param(self, param, gradient):
-        mean_squared_grad = theano.shared(np.zeros_like(param.get_value()))
-        new_mean_squared_grad = self.decay * mean_squared_grad + (1-self.decay) * gradient**2
+    def _get_updates_for_param(self, param, gradient):
+        # mean_squared_grad = theano.shared(np.zeros_like(param.get_value()))
+        mean_squared_grad, initialized = create_optimizer_param_like(param)
+
+        new_mean_squared_grad = ifelse(initialized, self.decay * mean_squared_grad + (1-self.decay) * gradient**2, (1-self.decay) * gradient**2)
         delta_p = - self.learning_rate * gradient / tt.maximum(tt.sqrt(new_mean_squared_grad), self.epsilon)
-        add_update(param, param + delta_p)
-        add_update(mean_squared_grad, new_mean_squared_grad)
+
+        return [(param, param + delta_p), (mean_squared_grad, new_mean_squared_grad)]
+        # add_update(param, param + delta_p)
+        # add_update(mean_squared_grad, new_mean_squared_grad)
 
 
 class AdaGrad(UniformParameterOptimizer):
@@ -216,12 +280,16 @@ def __init__(self, learning_rate = 0.01, decay_rate = 0, max_scaling = 1e5):
         self.learning_rate = learning_rate
         self.decay_rate = decay_rate
 
-    def _update_param(self, param, gradient):
-        sum_squared_grad = theano.shared(param.get_value()*0)
-        new_ssg = (1-self.decay_rate)*sum_squared_grad + gradient**2
+    def _get_updates_for_param(self, param, gradient):
+        # sum_squared_grad = theano.shared(param.get_value()*0)
+
+        sum_squared_grad, initialized = create_optimizer_param_like(param)
+
+        new_ssg = ifelse(initialized, (1-self.decay_rate)*sum_squared_grad + gradient**2, gradient**2)
         scale = tt.maximum(self.eps, tt.sqrt(new_ssg))
-        add_update(param, param - (self.learning_rate / scale) * gradient)
-        add_update(sum_squared_grad, new_ssg)
+        return [(param, param - (self.learning_rate / scale) * gradient), (sum_squared_grad, new_ssg)]
+        # add_update(param, param - (self.learning_rate / scale) * gradient)
+        # add_update(sum_squared_grad, new_ssg)
 
 
 class GradientDescent(UniformParameterOptimizer):
@@ -235,16 +303,23 @@ def __init__(self, eta, momentum = 0, decay = 0):
         self.momentum = momentum
         self.decay = decay
 
-    def _update_param(self, param, gradient):
+    def _get_updates_for_param(self, param, gradient):
+
+        updates = []
 
         if self.momentum != 0:
-            mom = theano.shared(np.zeros_like(param.get_value()))
-            new_mom = self.momentum * mom + gradient
-            add_update(mom, new_mom)
+            mom, initialized = create_optimizer_param_like(param)
+            # mom = theano.shared(np.zeros_like(param.get_value()))
+            new_mom = ifelse(initialized, self.momentum * mom + gradient, gradient)
+            # add_update(mom, new_mom)
+            updates.append((mom, new_mom))
             direction = new_mom  # Or mom, something about Nesterov...
         else:
             direction = gradient
-        add_update(param, param - self.eta*direction - self.decay*param)
+
+        updates.insert(0, (param, param - self.eta*direction - self.decay*param))
+        return updates
+        # add_update(param, param - self.eta*direction - self.decay*param)
 
 
 class MultiplicativeGradientDescent(UniformParameterOptimizer):
@@ -252,9 +327,10 @@ class MultiplicativeGradientDescent(UniformParameterOptimizer):
     def __init__(self, factor = 0.01):
         self.factor = factor
 
-    def _update_param(self, param, gradient):
+    def _get_updates_for_param(self, param, gradient):
         multiplier = tt.exp(-tt.tanh(gradient)*self.factor)
-        add_update(param, param*multiplier)
+        return [(param, param*multiplier)]
+        # add_update(param, param*multiplier)
 
 
 class PIDOptimizer(UniformParameterOptimizer):
@@ -268,35 +344,41 @@ def __init__(self, kp=0.1, ki=0, kd=0):
         self.ki = ki
         self.kd = kd
 
-    def _update_param(self, param, gradient):
+    def _get_updates_for_param(self, param, gradient):
+
+        updates = []
         new_param = param
         if self.kp != 0:
             new_param -= self.kp * gradient
         if self.ki != 0:
             grad_integral = create_shared_variable(np.zeros_like(param.get_value()))
             new_gradient_integral = grad_integral + grad_integral
-            add_update(grad_integral, new_gradient_integral)
+            # add_update(grad_integral, new_gradient_integral)
+            updates.append((grad_integral, new_gradient_integral))
             new_param -= self.ki * new_gradient_integral
         if self.kd != 0:
             grad_last = create_shared_variable(np.zeros_like(param.get_value()))
-            add_update(grad_last, gradient)
+            # add_update(grad_last, gradient)
+            updates.append((grad_last, gradient))
             new_param -= self.kd * (gradient - grad_last)
-        add_update(param, new_param)
+        # add_update(param, new_param)
+        updates.insert(0, (param, new_param))
+        return updates
 
 
-def get_named_optimizer(name, learning_rate, rng = None):
+def get_named_optimizer(name, learning_rate, rng = None, **kwargs):
     """
     Convenience function for easily specifying optimizers.
     :param name: The name of the optimizer
     :param learning_rate: A scalar, representing the parameter that's most equivalent to a learning rate.
-    :return: An IGradientOptimizer object.
+    :return IGradientOptimizer: The optimizer object.
     """
     return {
-        'sgd': lambda: SimpleGradientDescent(eta = learning_rate),
-        'adam': lambda: Adam(alpha=learning_rate),
-        'adamax': lambda: AdaMax(alpha=learning_rate),
-        'rmsprop': lambda: RMSProp(learning_rate=learning_rate),
-        'adagrad': lambda: AdaGrad(learning_rate=learning_rate),
-        'mulsgd': lambda: MultiplicativeGradientDescent(factor=learning_rate),
-        'langevin': lambda: LangevinGradientDescent(eta = learning_rate, rng = rng),
+        'sgd': lambda: GradientDescent(eta = learning_rate, **kwargs),
+        'adam': lambda: Adam(alpha=learning_rate, **kwargs),
+        'adamax': lambda: AdaMax(alpha=learning_rate, **kwargs),
+        'rmsprop': lambda: RMSProp(learning_rate=learning_rate, **kwargs),
+        'adagrad': lambda: AdaGrad(learning_rate=learning_rate, **kwargs),
+        'mulsgd': lambda: MultiplicativeGradientDescent(factor=learning_rate, **kwargs),
+        'langevin': lambda: LangevinGradientDescent(eta = learning_rate, rng = rng, **kwargs),
     }[name]()
diff --git a/plato/tools/optimization/test_optimizers.py b/plato/tools/optimization/test_optimizers.py
index 2344355..8e3cdea 100644
--- a/plato/tools/optimization/test_optimizers.py
+++ b/plato/tools/optimization/test_optimizers.py
@@ -1,7 +1,10 @@
+from plato.core import symbolic, add_update
 from plato.tools.optimization.demo_compare_optimizers import get_experiments
-from plato.tools.optimization.optimizers import GradientDescent, Adam, AdaMax
+from plato.tools.optimization.optimizers import GradientDescent, Adam, AdaMax, RMSProp, get_named_optimizer
 from plato.tools.regressors.online_regressor import OnlineRegressor
 from artemis.ml.predictors.predictor_tests import assert_online_predictor_not_broken
+import theano.tensor as tt
+import numpy as np
 
 
 def _test_optimizer_on_simple_classification_problem(optimizer):
@@ -32,14 +35,39 @@ def test_adamax_optimizer():
     _test_optimizer_on_simple_classification_problem(AdaMax(alpha=0.01))
 
 
-if __name__ == '__main__':
-    test_gradient_descent_optimizer()
-    test_adam_optimizer()
-    test_adamax_optimizer()
+def test_unknown_shape():
+
+    @symbolic
+    def func(x, optimizer):
+        loss = tt.sum((x-3)**2)
+        updates = optimizer.get_updates(cost=loss, parameters=[x])
+        for p, v in updates[1:]:
+            add_update(p, v)
+        return updates[0][1]
+
+    x_base = np.random.RandomState(1234).randn(3, 4)
+    for opt in ('adam', 'adamax', 'adagrad', 'rmsprop'):
+        print('Running Optimizer: {}'.format(opt))
+        optimizer = get_named_optimizer(opt, learning_rate=0.5)
+        x = x_base
+        f = func.partial(optimizer = optimizer).compile()
+        for _ in range(50):
+            x = f(x)
+        error = np.abs(x-3)
+        print('Mean Error: {}'.format(error.mean()))
+        assert np.all(np.abs(x-3)<1.)
 
 
 def test_demo_compare_optimizers():
 
     for exp_name, exp in get_experiments().iteritems():
-        print 'Running %s' % exp_name
-        exp()
\ No newline at end of file
+        print('Running %s' % exp_name)
+        exp()
+
+
+if __name__ == '__main__':
+    # test_gradient_descent_optimizer()
+    # test_adam_optimizer()
+    # test_adamax_optimizer()
+    test_unknown_shape()
+

From 11659f8a864e73cd5249c707d0e87e6e33f1eddf Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Tue, 31 Jul 2018 18:06:33 +0200
Subject: [PATCH 28/29] avoid nose import

---
 plato/interfaces/helpers.py | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/plato/interfaces/helpers.py b/plato/interfaces/helpers.py
index fa4a58a..2bb18ec 100644
--- a/plato/interfaces/helpers.py
+++ b/plato/interfaces/helpers.py
@@ -1,16 +1,15 @@
 import numpy as np
-from plato.core import symbolic_simple, add_update, create_shared_variable, symbolic, CaptureUpdates
-from plato.interfaces.interfaces import IParameterized
 import theano
-from theano.compile.sharedvalue import SharedVariable
+import theano.tensor as tt
+from theano.gof.graph import Variable
 from theano.ifelse import ifelse
-from theano.sandbox.cuda.rng_curand import CURAND_RandomStreams
 from theano.sandbox.rng_mrg import MRG_RandomStreams
 from theano.tensor.shared_randomstreams import RandomStreams
-import theano.tensor as tt
 from theano.tensor.sharedvar import TensorSharedVariable
 from theano.tensor.var import TensorVariable
-from theano.gof.graph import Variable
+
+from plato.core import symbolic_simple, add_update, create_shared_variable, symbolic
+from plato.interfaces.interfaces import IParameterized
 
 __author__ = 'peter'
 
@@ -58,13 +57,17 @@ def get_theano_rng(seed, rngtype = 'mrg'):
     :return:
     """
 
+    def load_cuda_rng():
+        from theano.sandbox.cuda.rng_curand import CURAND_RandomStreams
+        return CURAND_RandomStreams
+
     stream_types = {
-        'mrg': MRG_RandomStreams_ext,
-        'mrg-old': MRG_RandomStreams,
-        'default': RandomStreams,
-        'cuda': CURAND_RandomStreams
+        'mrg': lambda: MRG_RandomStreams_ext,
+        'mrg-old': lambda: MRG_RandomStreams,
+        'default': lambda: RandomStreams,
+        'cuda': load_cuda_rng
     }
-    rng_con = stream_types[rngtype]
+    rng_con = stream_types[rngtype]()
 
     if isinstance(seed, np.random.RandomState):
         return rng_con(seed.randint(1e9))
@@ -72,7 +75,7 @@ def get_theano_rng(seed, rngtype = 'mrg'):
         return rng_con(seed)
     elif seed is None:
         return rng_con(np.random.randint(1e9))
-    elif isinstance(seed, tuple(stream_types.values())):
+    elif isinstance(seed, tuple(v() for v in stream_types.values())):
         return seed
     else:
         raise Exception("Can't initialize a random number generator with %s" % (seed, ))

From 5355d5f528ae52e8eb8ad51e90a3bd58b2fcb116 Mon Sep 17 00:00:00 2001
From: Peter O'Connor <peter.ed.oconnor@gmail.com>
Date: Tue, 7 Aug 2018 14:50:09 +0200
Subject: [PATCH 29/29] allow nones to be passed in and out of functions as
 special values

---
 plato/core.py      | 23 ++++++++++++++++++-----
 plato/test_core.py | 16 +++++++++++++++-
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/plato/core.py b/plato/core.py
index 2b5c49b..11a6959 100644
--- a/plato/core.py
+++ b/plato/core.py
@@ -582,6 +582,7 @@ def __init__(self, fcn, cast_to_floatx = 'float', fixed_args = None, add_test_va
         self._output_format = None
         self.updated_variables = None  # Used in reset()
         self.print_initial_shapes = print_initial_shapes
+        self._none_output_indices = None  # Indices of outputs of the function that are "None"... These are considered special and are passed straight through
 
         # Create convenient debugging functions: showloc() and locinfo()
         __builtins__['showloc'] = show_all_locals
@@ -616,7 +617,7 @@ def __call__(self, *args, **kwargs):
             # Find tensor versions of inputs based on data in first-call, collect list of inputs
             self._input_format = NestedType.from_data(input_data)
             flat_input_data = self._input_format.get_leaves(input_data)
-            args_and_kwarg_tensors = [_data_to_tensor(d, cast_to_floatx = self._cast_to_floatx, add_test_value = True if self._add_test_values else 'shape') for d in flat_input_data]
+            args_and_kwarg_tensors = [_data_to_tensor(d, cast_to_floatx = self._cast_to_floatx, add_test_value = True if self._add_test_values else 'shape') if d is not None else None for d in flat_input_data]
             self._shared_var_inputs = [trace_value for trace_value in args_and_kwarg_tensors if isinstance(trace_value, SharedVariable)]
             tensor_args, tensor_kwargs = self._input_format.expand_from_leaves(args_and_kwarg_tensors, check_types=False)  # Because types will be different
 
@@ -656,7 +657,7 @@ def __call__(self, *args, **kwargs):
                 self._local_variable_keys = self._original_fcn.locals().keys()
                 self._n_outputs = len(flat_output_tensors)
                 self._n_trace_vars = len(traces)
-                flat_output_tensors = flat_output_tensors+traces.values()+self._original_fcn.locals().values()
+                flat_output_tensors = flat_output_tensors+list(traces.values())+list(self._original_fcn.locals().values())
 
             # Compile the theano function
             if self.print_initial_shapes:
@@ -673,11 +674,18 @@ def __call__(self, *args, **kwargs):
             else:
                 PLATO_LOGGER.info('Compiling %s with %s inputs, %s outputs, %s updates' % (self._original_fcn.fcn_str(), len(args_and_kwarg_tensors), 1 if isinstance(outputs, Variable) else 0 if outputs is None else len(outputs), len(updates)))
 
-            args_and_kwarg_tensors = [a for a in args_and_kwarg_tensors if not isinstance(a, SharedVariable)]  # Remove shared variables from passed-in tensor args
+            args_and_kwarg_tensors = [a for a in args_and_kwarg_tensors if not isinstance(a, SharedVariable) and a is not None]  # Remove shared variables and None entries from passed-in tensor args
             if self.resettable:
                 self.updated_variables = [shared_var for shared_var, update in updates]
                 self._original_variable_values = [var.get_value() for var in self.updated_variables]
-            self._compiled_fcn = theano.function(inputs = args_and_kwarg_tensors, outputs = flat_output_tensors, updates = updates, allow_input_downcast=self._cast_to_floatx, **self.theano_function_kwargs)
+
+            if None in flat_output_tensors:
+                flat_non_none_output_tensors = list(x for x in flat_output_tensors if x is not None)
+                self._none_output_indices = [o is None for o in flat_output_tensors]
+            else:
+                flat_non_none_output_tensors = flat_output_tensors
+                self._none_output_indices = None
+            self._compiled_fcn = theano.function(inputs = args_and_kwarg_tensors, outputs = flat_non_none_output_tensors, updates = updates, allow_input_downcast=self._cast_to_floatx, **self.theano_function_kwargs)
             PLATO_LOGGER.info('Done.')
 
         # Ok, so this code runs every time you call the "compiled" function.
@@ -690,7 +698,7 @@ def __call__(self, *args, **kwargs):
             "The shared variables you passed in, {}, Don't match the shared variables you passed in when you first called this compiled function: {}. " \
             "This creates problems for us.  Instead, compile your function a second time for the new shared inputs."\
             .format(['{}@{}'.format(repr(trace_value), hex(id(trace_value))) for trace_value in shared_passed_in], ['{}@{}'.format(repr(trace_value), hex(id(trace_value))) for trace_value in self._shared_var_inputs])
-        arg_and_kwarg_values = [a for a in arg_and_kwarg_values if not isinstance(a, SharedVariable)]  # Remove shared variables from passed-in numeric args
+        arg_and_kwarg_values = [a for a in arg_and_kwarg_values if not isinstance(a, SharedVariable) and a is not None]  # Remove shared variables and None entries from passed-in numeric args
 
         # Now, run the actual numeric function!
         if self._there_are_debug_variables:  # Need to take care of stripping off the debug variables
@@ -704,6 +712,11 @@ def __call__(self, *args, **kwargs):
             self._local_values = {k: v for k, v in zip(self._local_variable_keys, local_out)}
         else:  # Normal case
             flat_output_data = all_out = self._compiled_fcn(*arg_and_kwarg_values)
+
+        if self._none_output_indices is not None:
+            flat_output_iter = iter(flat_output_data)
+            flat_output_data = list(next(flat_output_iter) if not isnone else None for isnone in self._none_output_indices)
+
         true_out = self._output_format.expand_from_leaves(flat_output_data, check_types=False) if len(flat_output_data)>0 else ()
 
         if self._debug_print_shapes:
diff --git a/plato/test_core.py b/plato/test_core.py
index 098f502..d6da9d3 100644
--- a/plato/test_core.py
+++ b/plato/test_core.py
@@ -637,6 +637,19 @@ def do_something_internal(a, b):
     assert np.array_equal(state.get_value(), np.arange(6).dot(np.arange(1, 7)))
 
 
+def test_none_inputs_and_outputs():
+    """Check that None entries in the inputs and outputs of a compiled symbolic function come back as None."""
+    @symbolic
+    def double_if_not_none(params):
+        return [p*2 if p is not None else None for p in params]  # None elements are returned as None, all others doubled
+
+    f = double_if_not_none.compile()
+    assert f([1, 2, 3, None, 4]) == [2, 4, 6, None, 8]  # First call
+    assert f([1, 2, 3, None, 4]) == [2, 4, 6, None, 8]  # Second call with the same None layout must give the same result
+
+    with pytest.raises(TypeError):  # Passing a value where earlier calls passed None is an inconsistent call and is expected to raise.
+        f([1, 2, 3, 3, 4])
+
 
 if __name__ == '__main__':
     test_ival_ishape()
@@ -662,4 +675,5 @@ def do_something_internal(a, b):
     test_function_reset()
     test_trace_var_in_scan()
     test_easy_scan_syntax()
-    test_scan_no_return()
\ No newline at end of file
+    test_scan_no_return()
+    test_none_inputs_and_outputs()