FFNet.h
1 /*!
2  *
3  *
4  * \brief Implements a Feed-Forward multilayer perceptron
5  *
6  *
7  *
8  * \author O. Krause
9  * \date 2010-2014
10  *
11  *
12  * \par Copyright 1995-2015 Shark Development Team
13  *
14  * <BR><HR>
15  * This file is part of Shark.
16  * <http://image.diku.dk/shark/>
17  *
18  * Shark is free software: you can redistribute it and/or modify
19  * it under the terms of the GNU Lesser General Public License as published
20  * by the Free Software Foundation, either version 3 of the License, or
21  * (at your option) any later version.
22  *
23  * Shark is distributed in the hope that it will be useful,
24  * but WITHOUT ANY WARRANTY; without even the implied warranty of
25  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26  * GNU Lesser General Public License for more details.
27  *
28  * You should have received a copy of the GNU Lesser General Public License
29  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
30  *
31  */
32 #ifndef SHARK_MODELS_FFNET_H
33 #define SHARK_MODELS_FFNET_H
34 
35 #include <shark/Models/AbstractModel.h>
36 #include <shark/Models/Neurons.h>
37 #include <boost/serialization/vector.hpp>
38 
39 namespace shark{
40 
41 struct FFNetStructures{
42  enum ConnectionType{
43  Normal, //< Layerwise connectivity without shortcuts
44  InputOutputShortcut, //< Normal with additional shortcuts from input to output neuron
45  Full //< Every layer is fully connected to all neurons in the lower layer
46  };
47 };
48 
49 //! \brief Offers the functions to create and to work with a feed-forward network.
50 //!
51 //! A feed-forward network consists of several layers. Every layer consists of a linear
52 //! function with optional bias whose response is modified by a (nonlinear) activation function.
53 //! Starting from the input layer, the output of every layer is the input of the next.
54 //! The two template arguments govern the activation functions of the network.
55 //! The activation functions are typically sigmoidal.
56 //! All hidden layers share one activation function, while the output layer can be chosen to use
57 //! a different one, for example to allow the last output to be unbounded, in which case a
58 //! linear output function is used.
59 //! It is not possible to use arbitrary activation functions; only neurons following the structure
60 //! in Models/Neurons.h can be used. In particular, the derivative of the activation function
61 //! must have the form f'(x) = g(f(x)) (e.g. the logistic function satisfies f'(x) = f(x)(1-f(x))).
62 //!
63 //! This network class allows for several different topologies. The layerwise structure
64 //! outlined above is the default one, but the network also allows for shortcuts. Most typically
65 //! an input-output shortcut is used, that is, a shortcut that connects the input neurons directly
66 //! with the output using linear weights. A fully connected structure is also possible, where
67 //! every layer is fed as input to every successive layer instead of only the next one.
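//!
//! A minimal usage sketch (editorial illustration, not part of the original header; it assumes
//! the LogisticNeuron and LinearNeuron activation types from Models/Neurons.h and an input
//! vector named point):
//! \code
//! FFNet<LogisticNeuron,LinearNeuron> network;
//! network.setStructure(2, 10, 1);          // 2 inputs, 10 hidden neurons, 1 linear output
//! RealVector point(2);
//! point(0) = 1.0; point(1) = -1.0;
//! RealVector prediction = network(point);  // forward pass through all layers
//! \endcode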
68 template<class HiddenNeuron,class OutputNeuron>
69 class FFNet :public AbstractModel<RealVector,RealVector>
70 {
71  struct InternalState: public State{
72  //! \brief Used to store the current results of the activation
73  //! function for all neurons for the last batch of patterns \f$x\f$.
74  //!
75  //! There is one value for input+hidden+output units for every element of the batch.
76  //! For every value, the following holds:
77  //! Given a network with \f$M\f$ neurons, including
78  //! \f$c\f$ input and \f$n\f$ output neurons the single
79  //! values for \f$z\f$ are given as:
80  //! <ul>
81  //! <li>\f$z_i = x_i,\ \mbox{for\ } 0 \leq i < c\f$</li>
82  //! <li>\f$z_i = g_{hidden}(x),\ \mbox{for\ } c \leq i < M - n\f$</li>
83  //! <li>\f$z_i = y_{i-M+n} = g_{output}(x),\ \mbox{for\ } M - n \leq
84  //! i < M\f$</li>
85  //! </ul>
86  RealMatrix responses;
87 
88  void resize(std::size_t neurons, std::size_t patterns){
89  responses.resize(neurons,patterns);
90  }
91  };
92 
93 
94 public:
95 
96  //! Creates an empty feed-forward network. After the constructor is called,
97  //! one version of the #setStructure methods needs to be called
98  //! to define the network topology.
99  FFNet()
100  :m_numberOfNeurons(0),m_inputNeurons(0),m_outputNeurons(0){
101  m_features|=HAS_FIRST_PARAMETER_DERIVATIVE;
102  m_features|=HAS_FIRST_INPUT_DERIVATIVE;
103  }
104 
105  //! \brief From INameable: return the class name.
106  std::string name() const
107  { return "FFNet"; }
108 
109  //! \brief Number of input neurons.
110  std::size_t inputSize()const{
111  return m_inputNeurons;
112  }
113  //! \brief Number of output neurons.
114  std::size_t outputSize()const{
115  return m_outputNeurons;
116  }
117  //! \brief Total number of neurons, that is inputs+hidden+outputs.
118  std::size_t numberOfNeurons()const{
119  return m_numberOfNeurons;
120  }
121  //! \brief Total number of hidden neurons.
122  std::size_t numberOfHiddenNeurons()const{
123  return numberOfNeurons() - inputSize() - outputSize();
124  }
125 
126  //! \brief Returns the matrices for every layer used by eval.
127  std::vector<RealMatrix> const& layerMatrices()const{
128  return m_layerMatrix;
129  }
130 
131  //! \brief Returns the weight matrix of the i-th layer.
132  RealMatrix const& layerMatrix(std::size_t layer)const{
133  return m_layerMatrix[layer];
134  }
135 
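//! \brief Sets the weight matrix and the bias vector of the i-th layer and rebuilds the
//! backpropagation matrices accordingly.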
136  void setLayer(std::size_t layerNumber, RealMatrix const& m, RealVector const& bias){
137  SIZE_CHECK(m.size1() == bias.size());
138  SIZE_CHECK(m.size1() == m_layerMatrix[layerNumber].size1());
139  SIZE_CHECK(m.size2() == m_layerMatrix[layerNumber].size2());
140  m_layerMatrix[layerNumber] = m;
141  std::size_t start = 0;
142  for(std::size_t i = 0; i != layerNumber; ++i){
143  start += m_layerMatrix[i].size1();
144  }
145  noalias(subrange(m_bias,start,start+bias.size())) = bias;
146  //set backprop matrices
147  setParameterVector(parameterVector());
148  }
149 
150  //! \brief Returns the matrices for every layer used by backpropagation.
151  std::vector<RealMatrix> const& backpropMatrices()const{
152  return m_backpropMatrix;
153  }
154 
155  //! \brief Returns the direct shortcuts between input and output neurons.
156  //!
157  //! This matrix is empty if no input-output shortcut exists.
158  RealMatrix const& inputOutputShortcut() const{
159  return m_inputOutputShortcut;
160  }
161 
162  /// \brief Returns the activation function of the hidden units.
163  HiddenNeuron const& hiddenActivationFunction()const{
164  return m_hiddenNeuron;
165  }
166  /// \brief Returns the activation function of the output units.
167  OutputNeuron const& outputActivationFunction()const{
168  return m_outputNeuron;
169  }
170 
171  /// \brief Returns the activation function of the hidden units.
172  HiddenNeuron& hiddenActivationFunction(){
173  return m_hiddenNeuron;
174  }
175  /// \brief Returns the activation function of the output units.
176  OutputNeuron& outputActivationFunction(){
177  return m_outputNeuron;
178  }
179 
180  //! \brief Returns the bias values for hidden and output units.
181  //!
182  //! This is either empty or a vector of size numberOfNeurons()-inputSize().
183  //! The first entry is the bias value of the first hidden unit, while the last outputSize() entries
184  //! are the bias values of the output units.
185  const RealVector& bias()const{
186  return m_bias;
187  }
188 
189  ///\brief Returns the portion of the bias vector of the i-th layer.
190  RealVector bias(std::size_t layer)const{
191  std::size_t start = 0;
192  for(std::size_t i = 0; i != layer; ++i){
193  start +=layerMatrices()[i].size1();
194  }
195  return subrange(m_bias,start,start+layerMatrices()[layer].size1());
196  }
197 
198  //! \brief Returns the total number of parameters of the network.
199  std::size_t numberOfParameters()const{
200  std::size_t numParams = m_inputOutputShortcut.size1()*m_inputOutputShortcut.size2();
201  numParams += bias().size();
202  for(std::size_t i = 0; i != layerMatrices().size(); ++i){
203  numParams += layerMatrices()[i].size1()*layerMatrices()[i].size2();
204  }
205  return numParams;
206  }
207 
208  //! Returns the vector of all network parameters, i.e. the weight matrices, bias values and shortcut weights concatenated.
209  RealVector parameterVector() const{
210  RealVector parameters(numberOfParameters());
211  init(parameters) << matrixSet(m_layerMatrix),m_bias,toVector(m_inputOutputShortcut);
212  return parameters;
213  }
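// A brief illustration (a sketch, not from the original source): the parameter vector is the
// flat view that optimizers read and write; round-tripping it leaves the network unchanged.
//
// RealVector w = network.parameterVector();
// w *= 0.5;                      // e.g. shrink all weights and bias values
// network.setParameterVector(w);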
214  //! Uses the values inside the parameter vector to set the weight matrices, bias values and shortcut weights.
215  void setParameterVector(RealVector const& newParameters){
216  //set the normal forward propagation weights
217  init(newParameters) >> matrixSet(m_layerMatrix),m_bias,toVector(m_inputOutputShortcut);
218 
219  //we also have to update the backpropagation weights
220  //This is more or less an inversion: for every connection i->j between a neuron i and a neuron j,
221  //the backpropagation matrix has an entry j->i.
222 
223  // We start with all neurons in layer i, looking at all layers j > i and checking whether
224  // they are connected. In this case we transpose the part of the matrix which connects
225  // layer j with layer i and copy it into the backprop matrix.
226  // We assume here that either all neurons in layer j are connected to all neurons in layer i
227  // or that there are no connections at all between the layers.
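 // Illustrative note (editorial addition, not in the original source): for a Normal network
 // created with layers {2,3,1}, m_layerMatrix holds a 3x2 and a 1x3 matrix, and the
 // corresponding m_backpropMatrix entries are their transposed views, a 2x3 and a 3x1 matrix.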
228  std::size_t layeriStart = 0;
229  for(std::size_t layeri = 0; layeri != m_layerMatrix.size(); ++layeri){
230  std::size_t columni = 0;
231  std::size_t neuronsi = inputSize();
232  if(layeri > 0)
233  neuronsi = m_layerMatrix[layeri-1].size1();
234 
235  std::size_t layerjStart = layeriStart + neuronsi;
236  for(std::size_t layerj = layeri; layerj != m_layerMatrix.size(); ++layerj){
237  std::size_t neuronsj = m_layerMatrix[layerj].size1();
238  //only process, if layer j has connections with layer i
239  if(layerjStart-m_layerMatrix[layerj].size2() <= layeriStart){
240 
241  //Start of the weight columns to layer i in layer j.
242  //parentheses are important to protect against underflow
243  std::size_t weightStartj = layeriStart -(layerjStart - m_layerMatrix[layerj].size2());
244  noalias(columns(m_backpropMatrix[layeri],columni,columni+neuronsj))
245  = trans(columns(m_layerMatrix[layerj],weightStartj,weightStartj+neuronsi));
246  }
247  columni += neuronsj;
248  layerjStart += neuronsj;
249  }
250  layeriStart += neuronsi;
251  }
252  }
253 
254  //! \brief Returns the output of all neurons after the last call of eval
255  //!
256  //! \param state last result of eval
257  //! \return Output value of the neurons.
258  RealMatrix const& neuronResponses(State const& state)const{
259  InternalState const& s = state.toState<InternalState>();
260  return s.responses;
261  }
262 
263  boost::shared_ptr<State> createState()const{
264  return boost::shared_ptr<State>(new InternalState());
265  }
266 
267  ///\brief Returns the response of the i-th layer given the input of that layer.
268  ///
269  /// This is useful if only a portion of the network needs to be evaluated.
270  /// Be aware that this only works for networks without shortcuts.
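 /// A brief usage sketch (editorial illustration; the names network and inputBatch are assumed):
 /// \code
 /// RealMatrix hiddenResponses;
 /// network.evalLayer(0, inputBatch, hiddenResponses); // inputBatch holds one pattern per row
 /// \endcode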
271  void evalLayer(std::size_t layer,RealMatrix const& patterns,RealMatrix& outputs)const{
272  std::size_t numPatterns = patterns.size1();
273  std::size_t numOutputs = m_layerMatrix[layer].size1();
274  outputs.resize(numPatterns,numOutputs);
275  outputs.clear();
276 
277  //calculate activation. first compute the linear part and the optional bias and then apply
278  // the non-linearity
279  noalias(outputs) = prod(patterns,trans(layerMatrix(layer)));
280  if(!bias().empty()){
281  noalias(outputs) += repeat(bias(layer),numPatterns);
282  }
283  // apply the hidden neuron activation, or the output neuron activation if this is the last layer
284  if(layer < m_layerMatrix.size()-1) {
285  noalias(outputs) = m_hiddenNeuron(outputs);
286  }
287  else {
288  noalias(outputs) = m_outputNeuron(outputs);
289  }
290  }
291 
292  ///\brief Returns the response of the i-th layer given the input of that layer.
293  ///
294  /// This is useful if only a portion of the network needs to be evaluated.
295  /// Be aware that this only works for networks without shortcuts.
296  Data<RealVector> evalLayer(std::size_t layer, Data<RealVector> const& patterns)const{
297  int batches = (int) patterns.numberOfBatches();
298  Data<RealVector> result(batches);
299  SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i){
300  evalLayer(layer,patterns.batch(i),result.batch(i));
301  }
302  return result;
303  }
304 
305  void eval(RealMatrix const& patterns,RealMatrix& output, State& state)const{
306  InternalState& s = state.toState<InternalState>();
307  std::size_t numPatterns = patterns.size1();
308  //initialize the input layer using the patterns.
309  s.resize(numberOfNeurons(),numPatterns);
310  s.responses.clear();
311  noalias(rows(s.responses,0,m_inputNeurons)) = trans(patterns);
312  std::size_t beginNeuron = m_inputNeurons;
313 
314  for(std::size_t layer = 0; layer != m_layerMatrix.size();++layer){
315  const RealMatrix& weights = m_layerMatrix[layer];
316  //number of rows of the layer is also the number of neurons
317  std::size_t endNeuron = beginNeuron + weights.size1();
318  //some subranges of vectors
319  //the inputs of the layer are the n neurons preceding it, where n is the number of columns of the matrix
320  RealSubMatrix const input = rows(s.responses,beginNeuron - weights.size2(),beginNeuron);
321  //the neurons responses
322  RealSubMatrix responses = rows(s.responses,beginNeuron,endNeuron);
323 
324  //calculate activation. first compute the linear part and the optional bias and then apply
325  // the non-linearity
326  noalias(responses) = prod(weights,input);
327  if(!bias().empty()){
328  //the bias of the layer is shifted as input units cannot have a bias.
329  ConstRealVectorRange bias = subrange(m_bias,beginNeuron-inputSize(),endNeuron-inputSize());
330  noalias(responses) += trans(repeat(bias,numPatterns));
331  }
332  SHARK_CRITICAL_REGION{//beware Dropout Neurons!
333  // apply the hidden neuron activation, or the output neuron activation (plus shortcuts) if this is the last layer
334  if(layer < m_layerMatrix.size()-1) {
335  noalias(responses) = m_hiddenNeuron(responses);
336  }
337  else {
338  //add shortcuts if necessary
339  if(m_inputOutputShortcut.size1() != 0){
340  noalias(responses) += prod(m_inputOutputShortcut,trans(patterns));
341  }
342  noalias(responses) = m_outputNeuron(responses);
343  }
344  }
345  //go to the next layer
346  beginNeuron = endNeuron;
347  }
348  //Sanity check
349  SIZE_CHECK(beginNeuron == m_numberOfNeurons);
350 
351  //copy output layer into output
352  output.resize(numPatterns,m_outputNeurons);
353  noalias(output) = trans(rows(s.responses,m_numberOfNeurons-outputSize(),m_numberOfNeurons));
354  }
356 
357  void weightedParameterDerivative(
358  BatchInputType const& patterns, RealMatrix const& coefficients, State const& state, RealVector& gradient
359  )const{
360  SIZE_CHECK(coefficients.size2() == m_outputNeurons);
361  SIZE_CHECK(coefficients.size1() == patterns.size1());
362  std::size_t numPatterns=patterns.size1();
363 
364  //initialize delta using coefficients and clear the rest. Also don't compute the delta for
365  // the input neurons as they are not needed.
366  RealMatrix delta(numberOfNeurons(),numPatterns,0.0);
367  RealSubMatrix outputDelta = rows(delta,delta.size1()-outputSize(),delta.size1());
368  noalias(outputDelta) = trans(coefficients);
369 
370  computeDelta(delta,state,false);
371  computeParameterDerivative(delta,state,gradient);
372 
373  }
374 
375  void weightedInputDerivative(
376  BatchInputType const& patterns, RealMatrix const& coefficients, State const& state, BatchInputType& inputDerivative
377  )const{
378  SIZE_CHECK(coefficients.size2() == m_outputNeurons);
379  SIZE_CHECK(coefficients.size1() == patterns.size1());
380  std::size_t numPatterns=patterns.size1();
381 
382  //initialize delta using coefficients and clear the rest
383  //we compute the full set of delta values here. the delta values of the inputs are the inputDerivative
384  RealMatrix delta(numberOfNeurons(),numPatterns,0.0);
385  RealSubMatrix outputDelta = rows(delta,delta.size1()-outputSize(),delta.size1());
386  noalias(outputDelta) = trans(coefficients);
387 
388  computeDelta(delta,state,true);
389  inputDerivative.resize(numPatterns,inputSize());
390  noalias(inputDerivative) = trans(rows(delta,0,inputSize()));
391  }
392 
393  virtual void weightedDerivatives(
394  BatchInputType const & patterns,
395  BatchOutputType const & coefficients,
396  State const& state,
397  RealVector& parameterDerivative,
398  BatchInputType& inputDerivative
399  )const{
400  SIZE_CHECK(coefficients.size2() == m_outputNeurons);
401  SIZE_CHECK(coefficients.size1() == patterns.size1());
402  std::size_t numPatterns = patterns.size1();
403 
404 
405  //compute full delta and thus the input derivative
406  RealMatrix delta(numberOfNeurons(),numPatterns,0.0);
407  RealSubMatrix outputDelta = rows(delta,delta.size1()-outputSize(),delta.size1());
408  noalias(outputDelta) = trans(coefficients);
409 
410  computeDelta(delta,state,true);
411  inputDerivative.resize(numPatterns,inputSize());
412  noalias(inputDerivative) = trans(rows(delta,0,inputSize()));
413 
414  //reuse delta to compute the parameter derivative
415  computeParameterDerivative(delta,state,parameterDerivative);
416  }
417 
418  //! \brief Calculates the derivative for the special case, when error terms for all neurons of the network exist.
419  //!
420  //! This is useful when the hidden neurons need to meet additional requirements.
421  //! The value of delta is changed during computation and holds the results of the backpropagation steps.
422  //! The format is such that the rows of delta are the neurons and the columns the patterns.
423  void weightedParameterDerivativeFullDelta(
424  RealMatrix const& patterns, RealMatrix& delta, State const& state, RealVector& gradient
425  )const{
426  InternalState const& s = state.toState<InternalState>();
427  SIZE_CHECK(delta.size1() == m_numberOfNeurons);
428  SIZE_CHECK(delta.size2() == patterns.size1());
429  SIZE_CHECK(s.responses.size2() == patterns.size1());
430 
431  computeDelta(delta,state,false);
432  //now compute the parameter derivative from the delta values
433  computeParameterDerivative(delta,state,gradient);
434  }
435 
436  //! \brief Creates a connection matrix for a network.
437  //!
438  //! Automatically creates a network with several layers, with
439  //! the numbers of neurons for each layer defined by \em layers.
440  //! The vector \em layers must have at least size 2; a size of exactly 2 results in a network with no hidden layers.
441  //! The first and last values correspond to the number of inputs and outputs respectively.
442  //!
443  //! The network supports three different types of connection models:
444  //! FFNetStructures::Normal corresponds to a layerwise connection between consecutive
445  //! layers. FFNetStructures::InputOutputShortcut additionally adds a shortcut between
446  //! input and output neurons. FFNetStructures::Full connects every layer to every following
447  //! layer; this also includes the shortcuts between input and output neurons. Additionally,
448  //! a bias term can be used.
449  //!
450  //! While Normal and Full only use the layer matrices, inputOutputShortcut also uses
451  //! the corresponding matrix variable (be aware that in the case of only one hidden layer,
452  //! the shortcut between input and output leads to the same network as the Full - in that case
453  //! the Full topology is chosen for optimization reasons)
454  //!
455  //! \param layers contains the numbers of neurons for each layer of the network.
456  //! \param connectivity type of connection used between layers
457  //! \param biasNeuron if set to \em true, connections from
458  //! all neurons (except the input neurons)
459  //! to the bias will be set.
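 //!
 //! A usage sketch (editorial illustration; the object name network is assumed): a network with
 //! two inputs, two hidden layers of ten neurons each, one output and an input-output shortcut:
 //! \code
 //! std::vector<std::size_t> layers(4);
 //! layers[0] = 2; layers[1] = 10; layers[2] = 10; layers[3] = 1;
 //! network.setStructure(layers, FFNetStructures::InputOutputShortcut, true);
 //! \endcode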
460  void setStructure(
461  std::vector<size_t> const& layers,
462  FFNetStructures::ConnectionType connectivity = FFNetStructures::Normal,
463  bool biasNeuron = true
464  ){
465  SIZE_CHECK(layers.size() >= 2);
466  m_layerMatrix.resize(layers.size()-1);//we don't model the input layer
467  m_backpropMatrix.resize(layers.size()-1);//we don't model the output layer
468 
469  //small optimization for networks with only 3 layers
470  //in this case, we don't need an explicit shortcut as we can integrate it into
471  //the big matrices
472  if(connectivity == FFNetStructures::InputOutputShortcut && layers.size() ==3)
473  connectivity = FFNetStructures::Full;
474 
475 
476  m_inputNeurons = layers.front();
477  m_outputNeurons = layers.back();
478  m_numberOfNeurons = 0;
479  for(std::size_t i = 0; i != layers.size(); ++i){
480  m_numberOfNeurons += layers[i];
481  }
482  if(biasNeuron){
483  m_bias.resize(m_numberOfNeurons - m_inputNeurons);
484  }
485 
486  if(connectivity == FFNetStructures::Full){
487  //connect to all previous layers.
488  std::size_t numNeurons = layers[0];
489  for(std::size_t i = 0; i != m_layerMatrix.size(); ++i){
490  m_layerMatrix[i].resize(layers[i+1],numNeurons);
491  m_backpropMatrix[i].resize(layers[i],m_numberOfNeurons-numNeurons);
492  numNeurons += layers[i+1];
493 
494  }
495  m_inputOutputShortcut.resize(0,0);
496  }else{
497  //only connect with the previous layer
498  for(std::size_t i = 0; i != m_layerMatrix.size(); ++i){
499  m_layerMatrix[i].resize(layers[i+1],layers[i]);
500  m_backpropMatrix[i].resize(layers[i],layers[i+1]);
501  }
502 
503  //create a shortcut from input to output when desired
504  if(connectivity == FFNetStructures::InputOutputShortcut){
505  m_inputOutputShortcut.resize(m_outputNeurons,m_inputNeurons);
506  }
507  }
508  }
509 
510  //! \brief Creates a connection matrix for a network with a
511  //! single hidden layer
512  //!
513  //! Automatically creates a network with
514  //! three different layers: An input layer with \em in input neurons,
515  //! an output layer with \em out output neurons and one hidden layer
516  //! with \em hidden neurons.
517  //!
518  //! \param in number of input neurons.
519  //! \param hidden number of neurons of the hidden layer.
520  //! \param out number of output neurons.
521  //! \param connectivity Type of connectivity between the layers
522  //! \param bias if set to \em true, connections from
523  //! all neurons (except the input neurons)
524  //! to the bias will be set.
525  void setStructure(
526  std::size_t in,
527  std::size_t hidden,
528  std::size_t out,
529  FFNetStructures::ConnectionType connectivity = FFNetStructures::Normal,
530  bool bias = true
531  ){
532  std::vector<size_t> layer(3);
533  layer[0] = in;
534  layer[1] = hidden;
535  layer[2] = out;
536  setStructure(layer, connectivity, bias);
537  }
538 
539  //! \brief Creates a connection matrix for a network with two
540  //! hidden layers.
541  //!
542  //! Automatically creates a network with
543  //! four different layers: An input layer with \em in input neurons,
544  //! an output layer with \em out output neurons and two hidden layers
545  //! with \em hidden1 and \em hidden2 hidden neurons, respectively.
546  //!
547  //! \param in number of input neurons.
548  //! \param hidden1 number of neurons of the first hidden layer.
549  //! \param hidden2 number of neurons of the second hidden layer.
550  //! \param out number of output neurons.
551  //! \param connectivity Type of connectivity between the layers
552  //! \param bias if set to \em true, connections from
553  //! all neurons (except the input neurons)
554  //! to the bias will be set.
555  void setStructure(
556  std::size_t in,
557  std::size_t hidden1,
558  std::size_t hidden2,
559  std::size_t out,
560  FFNetStructures::ConnectionType connectivity = FFNetStructures::Normal,
561  bool bias = true
562  ){
563  std::vector<size_t> layer(4);
564  layer[0] = in;
565  layer[1] = hidden1;
566  layer[2] = hidden2;
567  layer[3] = out;
568  setStructure(layer, connectivity, bias);
569  }
570 
571  //! From ISerializable, reads a model from an archive
572  void read( InArchive & archive ){
573  archive>>m_inputNeurons;
574  archive>>m_outputNeurons;
575  archive>>m_numberOfNeurons;
576  archive>>m_layerMatrix;
577  archive>>m_backpropMatrix;
578  archive>>m_inputOutputShortcut;
579  archive>>m_bias;
580  }
581 
582  //! From ISerializable, writes a model to an archive
583  void write( OutArchive & archive ) const{
584  archive<<m_inputNeurons;
585  archive<<m_outputNeurons;
586  archive<<m_numberOfNeurons;
587  archive<<m_layerMatrix;
588  archive<<m_backpropMatrix;
589  archive<<m_inputOutputShortcut;
590  archive<<m_bias;
591  }
592 
593 
594 private:
595 
596  void computeDelta(
597  RealMatrix& delta, State const& state, bool computeInputDelta
598  )const{
599  SIZE_CHECK(delta.size1() == numberOfNeurons());
600  InternalState const& s = state.toState<InternalState>();
601 
602  //initialize output neurons using coefficients
603  RealSubMatrix outputDelta = rows(delta,delta.size1()-outputSize(),delta.size1());
604  ConstRealSubMatrix outputResponse = rows(s.responses,delta.size1()-outputSize(),delta.size1());
605  noalias(outputDelta) *= m_outputNeuron.derivative(outputResponse);
606 
607  //iterate backwards using the backprop matrix and propagate the errors to get the needed delta values
608  //we stop once we have filled all required delta values, thus we might not necessarily compute all layers.
609 
610  //last neuron of the current layer that we need to compute
611  //we don't need to (and cannot) compute the values of the output neurons as they are given from the outside
612  std::size_t endNeuron = delta.size1()-outputSize();
613  std::size_t layer = m_backpropMatrix.size()-1;
614  std::size_t endIndex = computeInputDelta? 0: inputSize();
615  while(endNeuron > endIndex){
616 
617  RealMatrix const& weights = m_backpropMatrix[layer];
618  std::size_t beginNeuron = endNeuron - weights.size1();//first neuron of the current layer
619  //get the delta and response values of this layer
620  RealSubMatrix layerDelta = rows(delta,beginNeuron,endNeuron);
621  RealSubMatrix layerDeltaInput = rows(delta,endNeuron,endNeuron+weights.size2());
622  ConstRealSubMatrix layerResponse = rows(s.responses,beginNeuron,endNeuron);
623 
624  noalias(layerDelta) += prod(weights,layerDeltaInput);//add the values to the maybe non-empty delta part
625  if(layer != 0){
626  noalias(layerDelta) *= m_hiddenNeuron.derivative(layerResponse);
627  }
628  //go a layer backwards
629  endNeuron=beginNeuron;
630  --layer;
631  }
632 
633  //add the shortcut deltas if necessary
634  if(inputOutputShortcut().size1() != 0)
635  noalias(rows(delta,0,inputSize())) += prod(trans(inputOutputShortcut()),outputDelta);
636  }
637 
638  void computeParameterDerivative(RealMatrix const& delta, State const& state, RealVector& gradient)const{
639  SIZE_CHECK(delta.size1() == numberOfNeurons());
640  InternalState const& s = state.toState<InternalState>();
641  // calculate error gradient
642  //todo: take network structure into account to prevent checking all possible weights...
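 // Editorial note (not in the original source): per layer the weight gradient is the product
 // of that layer's delta values with the transposed responses of its input neurons,
 // accumulated over all patterns of the batch; the loop below computes exactly this product.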
643  gradient.resize(numberOfParameters());
644  std::size_t pos = 0;
645  std::size_t layerStart = inputSize();
646  for(std::size_t layer = 0; layer != layerMatrices().size(); ++layer){
647  std::size_t layerRows = layerMatrices()[layer].size1();
648  std::size_t layerColumns = layerMatrices()[layer].size2();
649  std::size_t params = layerRows*layerColumns;
650  axpy_prod(
651  rows(delta,layerStart,layerStart+layerRows),
652  trans(rows(s.responses,layerStart-layerColumns,layerStart)),
653  //interpret part of the gradient as the weights of the layer
654  to_matrix(subrange(gradient,pos,pos+params),layerRows,layerColumns)
655  );
656  pos += params;
657  layerStart += layerRows;
658  }
659  //check whether we need the bias derivative
660  if(!bias().empty()){
661  //calculate bias derivative
662  for (std::size_t neuron = m_inputNeurons; neuron < m_numberOfNeurons; neuron++){
663  gradient(pos) = sum(row(delta,neuron));
664  pos++;
665  }
666  }
667  //compute shortcut derivative
668  if(inputOutputShortcut().size1() != 0){
669  std::size_t params = inputSize()*outputSize();
670  axpy_prod(
671  rows(delta,delta.size1()-outputSize(),delta.size1()),
672  trans(rows(s.responses,0,inputSize())),
673  to_matrix(subrange(gradient,pos,pos+params),outputSize(),inputSize())
674  );
675  }
676 
677  }
678 
679 
680  //! \brief Number of all network neurons.
681  //!
682  //! This is the total number of neurons in the network, i.e.
683  //! input, hidden and output neurons.
684  std::size_t m_numberOfNeurons;
685  std::size_t m_inputNeurons;
686  std::size_t m_outputNeurons;
687 
688  //! \brief represents the connection matrix using a layered structure for forward propagation
689  //!
690  //! A layer is made of neurons with consecutive indices which are not
691  //! connected with each other. In other words, if there exists a k with i<k<j such
692  //! that C(i,k) = 1 or C(k,j) = 1 or C(j,i) = 1, then the neurons i,j are not in the same layer.
693  //! This is the forward view, meaning that each layer holds the weights which are used to calculate
694  //! the activation of the neurons of that layer.
695  std::vector<RealMatrix> m_layerMatrix;
696 
697  //! \brief optional matrix directly connecting input to output
698  //!
699  //! This is only filled when the network has an input-output shortcut but not a full layer connection.
700  RealMatrix m_inputOutputShortcut;
701 
702  //!\brief Represents the backwards view of the network as a layered structure.
703  //!
704  //! This is the backward view of the network which is used for the backpropagation step. So every
705  //! matrix contains the weights of the neurons which are activated by the layer.
706  std::vector<RealMatrix> m_backpropMatrix;
707 
708  //! bias weights of the neurons
709  RealVector m_bias;
710 
711  //!Type of hidden neuron. See Models/Neurons.h for a few choices
712  HiddenNeuron m_hiddenNeuron;
713  //! Type of output neuron. See Models/Neurons.h for a few choices
714  OutputNeuron m_outputNeuron;
715 };
716 
717 
718 }
719 #endif