Rprop.h
/*!
 *
 * \brief Implements different versions of the Resilient Backpropagation (Rprop) algorithm.
 *
 * \author Oswin Krause
 * \date 2010
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef SHARK_ML_OPTIMIZER_RPROP_H
#define SHARK_ML_OPTIMIZER_RPROP_H

#include <shark/Core/DLLSupport.h>
#include <shark/Algorithms/AbstractSingleObjectiveOptimizer.h>

namespace shark{

/*!
 * \brief This class offers methods for the usage of the
 * Resilient-Backpropagation algorithm without weight-backtracking.
 *
 * The Rprop algorithm is an improvement over algorithms with adaptive
 * learning rates (such as the Adaptive Backpropagation algorithm by Silva
 * and Almeida; see AdpBP.h for a description of such an algorithm): it
 * uses update increments that are independent of the absolute values of
 * the partial derivatives. This makes sense because large flat regions
 * of the search space (plateaus) lead to small absolute partial
 * derivatives, so the increments would be chosen small even though large
 * increments are needed to cross the plateau. Conversely, the absolute
 * partial derivatives are very large at the "slopes" of very "narrow
 * canyons", which leads to large increments that skip the minimum lying
 * at the bottom of the canyon, even though small increments are needed
 * to hit the minimum. <br>
 * The Rprop algorithm therefore uses only the signs of the partial
 * derivatives, and not their absolute values, to adapt the parameters. <br>
 * Instead of individual learning rates, it uses the parameter
 * \f$\Delta_i^{(t)}\f$ for weight \f$w_i,\ i = 1, \dots, n\f$ in
 * iteration \f$t\f$, where the parameter is adapted before the
 * weights are changed: <br>
 *
 * \f$
 * \Delta_i^{(t)} = \Bigg\{
 * \begin{array}{ll}
 * \min( \eta^+ \cdot \Delta_i^{(t-1)}, \Delta_{max} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} > 0 \\
 * \max( \eta^- \cdot \Delta_i^{(t-1)}, \Delta_{min} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \\
 * \Delta_i^{(t-1)}, & \mbox{otherwise}
 * \end{array}
 * \f$
 *
 * The parameters \f$\eta^+ > 1\f$ and \f$0 < \eta^- < 1\f$ control
 * the speed of the adaptation. To stabilize the increments, they are
 * restricted to the interval \f$[\Delta_{min}, \Delta_{max}]\f$. <br>
 * After the adaptation of the \f$\Delta_i\f$, the weight update is
 * calculated as
 *
 * \f$
 * \Delta w_i^{(t)} := - \mbox{sign}
 * \left( \frac{\partial E^{(t)}}{\partial w_i}\right) \cdot \Delta_i^{(t)}
 * \f$
 *
 * For further information about the algorithm, please refer to: <br>
 *
 * Martin Riedmiller, <br>
 * "Advanced Supervised Learning in Multi-layer Perceptrons -
 * From Backpropagation to Adaptive Learning Algorithms". <br>
 * In "International Journal of Computer Standards and Interfaces", volume 16,
 * no. 5, 1994, pp. 265-278 <br>
 *
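 * \par Illustration
 * The following sketch is purely illustrative and is not the code used by
 * this class; all names are placeholders. It spells out one Rprop- iteration
 * for a parameter vector w with current gradient g, previous gradient gOld
 * and per-coordinate step sizes delta:
 * \code
 * #include <algorithm>
 * #include <cstddef>
 * #include <vector>
 *
 * void rpropMinusStep(
 *     std::vector<double>& w, std::vector<double> const& g,
 *     std::vector<double>& gOld, std::vector<double>& delta,
 *     double etaPlus = 1.2, double etaMinus = 0.5,
 *     double deltaMax = 1e100, double deltaMin = 0.0)
 * {
 *     for (std::size_t i = 0; i != w.size(); ++i) {
 *         double prod = gOld[i] * g[i];
 *         if (prod > 0)      delta[i] = std::min(etaPlus  * delta[i], deltaMax);
 *         else if (prod < 0) delta[i] = std::max(etaMinus * delta[i], deltaMin);
 *         // move against the sign of the current partial derivative
 *         if (g[i] > 0)      w[i] -= delta[i];
 *         else if (g[i] < 0) w[i] += delta[i];
 *         gOld[i] = g[i];
 *     }
 * }
 * \endcode
 *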
 * \author C. Igel
 * \date 1999
 *
 * \par Changes:
 * none
 *
 * \par Status:
 * stable
 *
 */
class RpropMinus : public AbstractSingleObjectiveOptimizer<RealVector >
{
public:
    SHARK_EXPORT_SYMBOL RpropMinus();

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "RpropMinus"; }

    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint);
    SHARK_EXPORT_SYMBOL virtual void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint, double initDelta);

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);

    SHARK_EXPORT_SYMBOL virtual void read( InArchive & archive );
    SHARK_EXPORT_SYMBOL virtual void write( OutArchive & archive ) const;

    //! set the decrease factor
    void setEtaMinus(double etaMinus) {
        RANGE_CHECK( etaMinus < 1 );
        RANGE_CHECK( etaMinus > 0 );
        m_decreaseFactor = etaMinus;
    }

    //! set the increase factor
    void setEtaPlus(double etaPlus) {
        RANGE_CHECK( etaPlus > 1 );
        m_increaseFactor = etaPlus;
    }

    //! set the upper limit on the update step size
    void setMaxDelta(double d) {
        RANGE_CHECK( d > 0 );
        m_maxDelta = d;
    }

    //! set the lower limit on the update step size
    void setMinDelta(double d) {
        RANGE_CHECK( d >= 0 );
        m_minDelta = d;
    }

    //! return the maximal step size component
    double maxDelta() const {
        return *std::max_element(m_delta.begin(),m_delta.end());
    }
protected:

    //! The increase factor \f$ \eta^+ \f$, set to 1.2 by default.
    double m_increaseFactor;

    //! The decrease factor \f$ \eta^- \f$, set to 0.5 by default.
    double m_decreaseFactor;

    //! The upper limit of the increments \f$ \Delta w_i^{(t)} \f$, set to 1e100 by default.
    double m_maxDelta;

    //! The lower limit of the increments \f$ \Delta w_i^{(t)} \f$, set to 0.0 by default.
    double m_minDelta;

    //! The last error gradient.
    RealVector m_oldDerivative;

    //! The absolute update values (increments) for all weights.
    RealVector m_delta;
};
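
/*
 * Illustrative usage sketch (not part of this header). The objective
 * function, its dimension, the starting point and the iteration count below
 * are placeholder choices; Rosenbrock is Shark's differentiable benchmark
 * function, and init(), step() and solution() are the optimizer interface
 * declared above.
 *
 *   #include <shark/Algorithms/GradientDescent/Rprop.h>
 *   #include <shark/ObjectiveFunctions/Benchmarks/Rosenbrock.h>
 *   #include <algorithm>
 *   #include <cstddef>
 *
 *   shark::Rosenbrock f(10);         // 10-dimensional benchmark objective
 *   shark::RealVector start(10);     // starting point
 *   std::fill(start.begin(), start.end(), 1.0);
 *
 *   shark::RpropMinus rprop;
 *   rprop.init(f, start);            // or init(f, start, initDelta)
 *   for (std::size_t i = 0; i != 1000; ++i)
 *       rprop.step(f);
 *
 *   shark::RealVector best  = rprop.solution().point;   // best point found
 *   double            value = rprop.solution().value;   // objective value there
 */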

//===========================================================================
/*!
 * \brief This class offers methods for the usage of the
 * Resilient-Backpropagation algorithm with weight-backtracking.
 *
 * The Rprop algorithm is an improvement over algorithms with adaptive
 * learning rates (such as the Adaptive Backpropagation algorithm by Silva
 * and Almeida; see AdpBP.h for a description of such an algorithm): it
 * uses update increments that are independent of the absolute values of
 * the partial derivatives. This makes sense because large flat regions
 * of the search space (plateaus) lead to small absolute partial
 * derivatives, so the increments would be chosen small even though large
 * increments are needed to cross the plateau. Conversely, the absolute
 * partial derivatives are very large at the "slopes" of very "narrow
 * canyons", which leads to large increments that skip the minimum lying
 * at the bottom of the canyon, even though small increments are needed
 * to hit the minimum. <br>
 * The Rprop algorithm therefore uses only the signs of the partial
 * derivatives, and not their absolute values, to adapt the parameters. <br>
 * Instead of individual learning rates, it uses the parameter
 * \f$\Delta_i^{(t)}\f$ for weight \f$w_i,\ i = 1, \dots, n\f$ in
 * iteration \f$t\f$, where the parameter is adapted before the
 * weights are changed: <br>
 *
 * \f$
 * \Delta_i^{(t)} = \Bigg\{
 * \begin{array}{ll}
 * \min( \eta^+ \cdot \Delta_i^{(t-1)}, \Delta_{max} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} > 0 \\
 * \max( \eta^- \cdot \Delta_i^{(t-1)}, \Delta_{min} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \\
 * \Delta_i^{(t-1)}, & \mbox{otherwise}
 * \end{array}
 * \f$
 *
 * The parameters \f$\eta^+ > 1\f$ and \f$0 < \eta^- < 1\f$ control
 * the speed of the adaptation. To stabilize the increments, they are
 * restricted to the interval \f$[\Delta_{min}, \Delta_{max}]\f$. <br>
 * After the adaptation of the \f$\Delta_i\f$, the weight update is
 * calculated as
 *
 * \f$
 * \Delta w_i^{(t)} := - \mbox{sign}
 * \left( \frac{\partial E^{(t)}}{\partial w_i}\right) \cdot \Delta_i^{(t)}
 * \f$
 *
 * Furthermore, weight-backtracking takes place to increase the
 * stability of the method, i.e.
 * if \f$\frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0\f$ then
 * \f$\Delta w_i^{(t)} := - \Delta w_i^{(t-1)};
 * \frac{\partial E^{(t)}}{\partial w_i} := 0\f$, where
 * the assignment of zero to the partial derivative of the error
 * leads to a freezing of the increment in the next iteration.
 *
 * For further information about the algorithm, please refer to: <br>
 *
 * Martin Riedmiller and Heinrich Braun, <br>
 * "A Direct Adaptive Method for Faster Backpropagation Learning: The
 * RPROP Algorithm". <br>
 * In "Proceedings of the IEEE International Conference on Neural Networks",
 * pp. 586-591, <br>
 * Published by IEEE Press in 1993
 *
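 * \par Illustration
 * The following sketch is purely illustrative and is not the code used by
 * this class; all names are placeholders. It spells out one Rprop+ iteration
 * including the weight-backtracking described above:
 * \code
 * #include <algorithm>
 * #include <cstddef>
 * #include <vector>
 *
 * void rpropPlusStep(
 *     std::vector<double>& w, std::vector<double> g,    // g is modified locally
 *     std::vector<double>& gOld, std::vector<double>& delta,
 *     std::vector<double>& deltaw,                      // last update per weight
 *     double etaPlus = 1.2, double etaMinus = 0.5,
 *     double deltaMax = 1e100, double deltaMin = 0.0)
 * {
 *     for (std::size_t i = 0; i != w.size(); ++i) {
 *         double prod = gOld[i] * g[i];
 *         if (prod < 0) {
 *             // sign change: shrink the step size and undo the previous update
 *             delta[i]  = std::max(etaMinus * delta[i], deltaMin);
 *             deltaw[i] = -deltaw[i];
 *             w[i]     += deltaw[i];
 *             g[i]      = 0.0;       // freeze the adaptation for one iteration
 *         } else {
 *             if (prod > 0) delta[i] = std::min(etaPlus * delta[i], deltaMax);
 *             deltaw[i] = (g[i] > 0) ? -delta[i] : (g[i] < 0 ? delta[i] : 0.0);
 *             w[i]     += deltaw[i];
 *         }
 *         gOld[i] = g[i];
 *     }
 * }
 * \endcode
 *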
 * \author C. Igel
 * \date 1999
 *
 * \par Changes:
 * none
 *
 * \par Status:
 * stable
 *
 */
class RpropPlus : public RpropMinus
{
public:
    SHARK_EXPORT_SYMBOL RpropPlus();

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "RpropPlus"; }

    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint);
    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint, double initDelta);

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);
    SHARK_EXPORT_SYMBOL void read( InArchive & archive );
    SHARK_EXPORT_SYMBOL void write( OutArchive & archive ) const;

protected:
    //! The final update values for all weights.
    RealVector m_deltaw;
};



/*!
 * \brief This class offers methods for the usage of the improved
 * Resilient-Backpropagation algorithm with weight-backtracking.
 *
 * The Rprop algorithm is an improvement over algorithms with adaptive
 * learning rates (such as the Adaptive Backpropagation algorithm by Silva
 * and Almeida; see AdpBP.h for a description of such an algorithm): it
 * uses update increments that are independent of the absolute values of
 * the partial derivatives. This makes sense because large flat regions
 * of the search space (plateaus) lead to small absolute partial
 * derivatives, so the increments would be chosen small even though large
 * increments are needed to cross the plateau. Conversely, the absolute
 * partial derivatives are very large at the "slopes" of very "narrow
 * canyons", which leads to large increments that skip the minimum lying
 * at the bottom of the canyon, even though small increments are needed
 * to hit the minimum. <br>
 * The Rprop algorithm therefore uses only the signs of the partial
 * derivatives, and not their absolute values, to adapt the parameters. <br>
 * Instead of individual learning rates, it uses the parameter
 * \f$\Delta_i^{(t)}\f$ for weight \f$w_i,\ i = 1, \dots, n\f$ in
 * iteration \f$t\f$, where the parameter is adapted before the
 * weights are changed: <br>
 *
 * \f$
 * \Delta_i^{(t)} = \Bigg\{
 * \begin{array}{ll}
 * \min( \eta^+ \cdot \Delta_i^{(t-1)}, \Delta_{max} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} > 0 \\
 * \max( \eta^- \cdot \Delta_i^{(t-1)}, \Delta_{min} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \\
 * \Delta_i^{(t-1)}, & \mbox{otherwise}
 * \end{array}
 * \f$
 *
 * The parameters \f$\eta^+ > 1\f$ and \f$0 < \eta^- < 1\f$ control
 * the speed of the adaptation. To stabilize the increments, they are
 * restricted to the interval \f$[\Delta_{min}, \Delta_{max}]\f$. <br>
 * After the adaptation of the \f$\Delta_i\f$, the weight update is
 * calculated as
 *
 * \f$
 * \Delta w_i^{(t)} := - \mbox{sign}
 * \left( \frac{\partial E^{(t)}}{\partial w_i}\right) \cdot \Delta_i^{(t)}
 * \f$
 *
 * Furthermore, weight-backtracking takes place to increase the
 * stability of the method. In contrast to the original Rprop algorithm
 * with weight-backtracking (see RpropPlus), this weight-backtracking
 * is improved by additionally taking the error of the previous iteration
 * \f$t - 1\f$ into account. <br>
 * The idea behind this modification is that a change of the sign of the
 * partial derivative \f$\frac{\partial E}{\partial w_i}\f$
 * only indicates that a minimum was skipped, not whether the step
 * brought the search closer to the minimum or not. <br>
 * By using the old error value, the improved weight-backtracking only
 * undoes changes when the error has increased, and only those parameters
 * \f$w_i\f$ are reset to their old values for which a sign change of
 * \f$\frac{\partial E}{\partial w_i}\f$ has taken place. <br>
 * So the new weight-backtracking rule is: <br>
 *
 * \f$
 * \mbox{if\ } \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \mbox{\ then} \{
 * \f$
 *
 * \f$
 * \begin{array}{lll}
 * \Delta w_i^{(t)} = \bigg\{ &
 * - \Delta w_i^{(t-1)}, & \mbox{if\ } E^{(t)} > E^{(t - 1)} \\
 * & 0, & \mbox{otherwise} \\
 * \frac{\partial E^{(t)}}{\partial w_i} := 0
 * \end{array}
 * \f$
 *
 * \f$\}\f$
 *
 * where the assignment of zero to the partial derivative of the error
 * leads to a freezing of the increment in the next iteration. <br>
 *
 * This modification of the weight-backtracking leads to better
 * optimization on artificial, paraboloidal error surfaces. <br>
 *
 * For further information about the algorithm, please refer to: <br>
 *
 * Christian Igel and Michael H&uuml;sken, <br>
 * "Empirical Evaluation of the Improved Rprop Learning Algorithms". <br>
 * Neurocomputing, volume 50, 2003, pp. 105-123 <br>
 *
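 * \par Illustration
 * The following sketch is purely illustrative and is not the code used by
 * this class; all names are placeholders. Compared to the Rprop+ sketch given
 * for RpropPlus, the only change is that the previous update is undone solely
 * when the error has increased:
 * \code
 * #include <algorithm>
 * #include <cstddef>
 * #include <vector>
 *
 * void iRpropPlusStep(
 *     std::vector<double>& w, std::vector<double> g,    // g is modified locally
 *     std::vector<double>& gOld, std::vector<double>& delta,
 *     std::vector<double>& deltaw,                      // last update per weight
 *     double error, double& oldError,                   // E^(t) and E^(t-1)
 *     double etaPlus = 1.2, double etaMinus = 0.5,
 *     double deltaMax = 1e100, double deltaMin = 0.0)
 * {
 *     for (std::size_t i = 0; i != w.size(); ++i) {
 *         double prod = gOld[i] * g[i];
 *         if (prod < 0) {
 *             delta[i] = std::max(etaMinus * delta[i], deltaMin);
 *             if (error > oldError)
 *                 w[i] -= deltaw[i]; // undo the previous update only if the error went up
 *             deltaw[i] = 0.0;       // reassigned before its next use anyway
 *             g[i]      = 0.0;       // freeze the adaptation for one iteration
 *         } else {
 *             if (prod > 0) delta[i] = std::min(etaPlus * delta[i], deltaMax);
 *             deltaw[i] = (g[i] > 0) ? -delta[i] : (g[i] < 0 ? delta[i] : 0.0);
 *             w[i]     += deltaw[i];
 *         }
 *         gOld[i] = g[i];
 *     }
 *     oldError = error;
 * }
 * \endcode
 *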
 * \author C. Igel, M. H&uuml;sken
 * \date 1999
 *
 * \par Changes:
 * none
 *
 * \par Status:
 * stable
 *
 */
class IRpropPlus : public RpropPlus
{
public:
    SHARK_EXPORT_SYMBOL IRpropPlus();

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "IRpropPlus"; }

    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint);
    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint, double initDelta);

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);

    SHARK_EXPORT_SYMBOL void setDerivativeThreshold(double derivativeThreshold);

    SHARK_EXPORT_SYMBOL void read( InArchive & archive );
    SHARK_EXPORT_SYMBOL void write( OutArchive & archive ) const;

protected:
    //! The error of the last iteration.
    double m_oldError;

    //! A threshold below which partial derivatives are set to zero.
    double m_derivativeThreshold;
};
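
/*
 * Illustrative configuration sketch (the values are arbitrary examples, not
 * recommendations; f and start stand for a differentiable objective function
 * and a starting point). The setters are the ones declared in RpropMinus and
 * IRpropPlus above:
 *
 *   shark::IRpropPlus rprop;
 *   rprop.setEtaPlus(1.2);                // increase factor, must be > 1
 *   rprop.setEtaMinus(0.5);               // decrease factor, must be in (0,1)
 *   rprop.setMaxDelta(1.0);               // upper limit on the step sizes
 *   rprop.setMinDelta(1e-6);              // lower limit on the step sizes
 *   rprop.setDerivativeThreshold(1e-12);  // derivatives below this are treated as zero
 *   rprop.init(f, start, 0.01);           // 0.01 is the initial step size initDelta
 */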


class IRpropPlusFull : public RpropPlus
{
public:
    SHARK_EXPORT_SYMBOL IRpropPlusFull();

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "IRpropPlusFull"; }

    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint);
    SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint, double initDelta);

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);

    SHARK_EXPORT_SYMBOL void setDerivativeThreshold(double derivativeThreshold);

    SHARK_EXPORT_SYMBOL void read( InArchive & archive );
    SHARK_EXPORT_SYMBOL void write( OutArchive & archive ) const;

protected:
    //! The error of the last iteration.
    double m_oldError;

    //! A threshold below which partial derivatives are set to zero.
    double m_derivativeThreshold;
};

//===========================================================================
/*!
 * \brief This class offers methods for the usage of the improved
 * Resilient-Backpropagation algorithm without weight-backtracking.
 *
 * The Rprop algorithm is an improvement over algorithms with adaptive
 * learning rates (such as the Adaptive Backpropagation algorithm by Silva
 * and Almeida; see AdpBP.h for a description of such an algorithm): it
 * uses update increments that are independent of the absolute values of
 * the partial derivatives. This makes sense because large flat regions
 * of the search space (plateaus) lead to small absolute partial
 * derivatives, so the increments would be chosen small even though large
 * increments are needed to cross the plateau. Conversely, the absolute
 * partial derivatives are very large at the "slopes" of very "narrow
 * canyons", which leads to large increments that skip the minimum lying
 * at the bottom of the canyon, even though small increments are needed
 * to hit the minimum. <br>
 * The Rprop algorithm therefore uses only the signs of the partial
 * derivatives, and not their absolute values, to adapt the parameters. <br>
 * Instead of individual learning rates, it uses the parameter
 * \f$\Delta_i^{(t)}\f$ for weight \f$w_i,\ i = 1, \dots, n\f$ in
 * iteration \f$t\f$, where the parameter is adapted before the
 * weights are changed. <br>
 * As an improving modification, this algorithm adopts the "freezing" of
 * the increment in the next iteration that is usually only practiced by
 * the Rprop algorithm with weight-backtracking (see RpropPlus),
 * i.e. \f$\frac{\partial E^{(t)}}{\partial w_i} := 0\f$.
 * Tests have shown considerably better optimization when using this
 * modification. So the new adaptation rule of \f$\Delta\f$ is given
 * as: <br>
 *
 * \f$
 * \Delta_i^{(t)} = \Bigg\{
 * \begin{array}{ll}
 * \min( \eta^+ \cdot \Delta_i^{(t-1)}, \Delta_{max} ), & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} > 0 \\
 * \max( \eta^- \cdot \Delta_i^{(t-1)}, \Delta_{min} );
 * \frac{\partial E^{(t)}}{\partial w_i} := 0, & \mbox{if\ }
 * \frac{\partial E^{(t-1)}}{\partial w_i} \cdot
 * \frac{\partial E^{(t)}}{\partial w_i} < 0 \\
 * \Delta_i^{(t-1)}, & \mbox{otherwise}
 * \end{array}
 * \f$
 *
 * The parameters \f$\eta^+ > 1\f$ and \f$0 < \eta^- < 1\f$ control
 * the speed of the adaptation. To stabilize the increments, they are
 * restricted to the interval \f$[\Delta_{min}, \Delta_{max}]\f$. <br>
 * After the adaptation of the \f$\Delta_i\f$, the weight update is
 * calculated as
 *
 * \f$
 * \Delta w_i^{(t)} := - \mbox{sign}
 * \left( \frac{\partial E^{(t)}}{\partial w_i}\right) \cdot \Delta_i^{(t)}
 * \f$
 *
 * For further information about the algorithm, please refer to: <br>
 *
 * Christian Igel and Michael H&uuml;sken, <br>
 * "Empirical Evaluation of the Improved Rprop Learning Algorithms". <br>
 * Neurocomputing, volume 50, 2003, pp. 105-123 <br>
 *
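 * \par Illustration
 * The following sketch is purely illustrative and is not the code used by
 * this class; all names are placeholders. It spells out one iRprop- iteration,
 * i.e. Rprop without weight-backtracking but with the gradient "freezing"
 * described above:
 * \code
 * #include <algorithm>
 * #include <cstddef>
 * #include <vector>
 *
 * void iRpropMinusStep(
 *     std::vector<double>& w, std::vector<double> g,    // g is modified locally
 *     std::vector<double>& gOld, std::vector<double>& delta,
 *     double etaPlus = 1.2, double etaMinus = 0.5,
 *     double deltaMax = 1e100, double deltaMin = 0.0)
 * {
 *     for (std::size_t i = 0; i != w.size(); ++i) {
 *         double prod = gOld[i] * g[i];
 *         if (prod > 0) {
 *             delta[i] = std::min(etaPlus * delta[i], deltaMax);
 *         } else if (prod < 0) {
 *             delta[i] = std::max(etaMinus * delta[i], deltaMin);
 *             g[i]     = 0.0;   // no step now and no adaptation in the next iteration
 *         }
 *         if (g[i] > 0)      w[i] -= delta[i];
 *         else if (g[i] < 0) w[i] += delta[i];
 *         gOld[i] = g[i];
 *     }
 * }
 * \endcode
 *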
 * \author C. Igel, M. H&uuml;sken
 * \date 1999
 *
 * \par Changes:
 * none
 *
 * \par Status:
 * stable
 *
 */
class IRpropMinus : public RpropMinus {
public:
    SHARK_EXPORT_SYMBOL IRpropMinus();

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "IRpropMinus"; }

    SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction);
};

//! Used to connect the class names with the year of
//! publication of the paper in which the algorithm was introduced.
typedef IRpropPlus Rprop99;

//! Used to connect the class names with the year of
//! publication of the paper in which the algorithm was introduced.
typedef IRpropMinus Rprop99d;

//! Used to connect the class names with the year of
//! publication of the paper in which the algorithm was introduced.
typedef RpropPlus Rprop93;

//! Used to connect the class names with the year of
//! publication of the paper in which the algorithm was introduced.
typedef RpropMinus Rprop94;
}

#endif