LMCMA.h
Go to the documentation of this file.
1 /*!
2  * \brief -
3  *
4  * \author Thomas Voss and Christian Igel
5  * \date April 2014
6  *
7  * \par Copyright 1995-2015 Shark Development Team
8  *
9  * <BR><HR>
10  * This file is part of Shark.
11  * <http://image.diku.dk/shark/>
12  *
13  * Shark is free software: you can redistribute it and/or modify
14  * it under the terms of the GNU Lesser General Public License as published
15  * by the Free Software Foundation, either version 3 of the License, or
16  * (at your option) any later version.
17  *
18  * Shark is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21  * GNU Lesser General Public License for more details.
22  *
23  * You should have received a copy of the GNU Lesser General Public License
24  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
25  *
26  */
27 #ifndef SHARK_ALGORITHMS_DIRECT_SEARCH_LM_CMA_H
28 #define SHARK_ALGORITHMS_DIRECT_SEARCH_LM_CMA_H
29 
33 
38 
39 
40 
41 namespace shark {
42 
43 namespace detail{
44 
45 ///\brief Approximates a Limited Memory Cholesky Matrix from a stream of samples.
46 ///
47 /// Given a set of points \f$ v_i\f$, produces an approximation of the cholesky factor of a matrix:
48 /// \f[ AA^T=C= (1-\alpha) C^{t-1} + \alpha* x_{j_t} x_{j_t}^T \f]
49 /// here the \f$ j_t \f$ are chosen such to have an approximate distance \f$ N_{steps} \f$. It is assumed
50 /// that the \f$x_i \f$ are correlated and thus a big \f$ N_{steps} \f$ tris to get points which are less
51 /// correlated. The matrix keeps a set of vectors and decides at every step which is will discard.
52 ///
53 /// This is the corrected algorithm as proposed in
54 /// Ilya Loshchilov, "A Computationally Efficient Limited Memory CMA-ES for Large Scale Optimization"
55 ///
56 class IncrementalCholeskyMatrix{
57 public:
58  IncrementalCholeskyMatrix(){}
59  void init (double alpha,std::size_t dimensions, std::size_t numVectors, std::size_t Nsteps){
60  m_vArr.resize(numVectors,dimensions);
61  m_pcArr.resize(numVectors,dimensions);
62  m_b.resize(numVectors);
63  m_d.resize(numVectors);
64  m_l.resize(numVectors);
65  m_j.resize(0);//nothing stored at the bginning
66  m_Nsteps = Nsteps;
67  m_maxStoredVectors = numVectors;
68  m_counter = 0;
69  m_alpha = alpha;
70 
71  m_vArr.clear();
72  m_pcArr.clear();
73  m_b.clear();
74  m_d.clear();
75  m_l.clear();
76  }
77 
78  //computes x = Az
79  template<class T>
80  void prod(RealVector& x, T const& z)const{
81  x = z;
82  double a = std::sqrt(1-m_alpha);
83  for(std::size_t j=0; j != m_j.size(); j++){
84  std::size_t jcur = m_j[j];
85  double k = m_b(jcur) *inner_prod(row(m_vArr,jcur),z);
86  noalias(x) = a*x+k*row(m_pcArr,jcur);
87  }
88  }
89 
90  //computes x= A^{-1}z
91  template<class T>
92  void inv(RealVector& x, T const& z)const{
93  inv(x,z,m_j.size());
94  }
95 
96  void update(RealVector const& newPc){
97  std::size_t imin = 0;//the index of the removed point
98  if (m_j.size() < m_maxStoredVectors)
99  {
100  std::size_t index = m_j.size();
101  m_j.push_back(index);
102  imin = index;
103  }
104  else
105  {
106  //find the largest "age"gap between neighbouring points (i.e. the time between insertion)
107  //we want to remove the smallest gap as to make the
108  //time distances as equal as possible
109  std::size_t dmin = m_l[m_j[1]] - m_l[m_j[0]];
110  imin = 1;
111  for(std::size_t j=2; j != m_j.size(); j++)
112  {
113  std::size_t dcur = m_l[m_j[j]] - m_l[m_j[j-1]];
114  if (dcur < dmin)
115  {
116  dmin = dcur;
117  imin = j;
118  }
119  }
120  //if the gap is bigger than Nsteps, we remove the oldest point to
121  //shrink it.
122  if (dmin >= m_Nsteps)
123  imin = 0;
124  //we push all points backwards and append the freed index to the end of the list
125  if (imin != m_j.size()-1)
126  {
127  std::size_t sav = m_j[imin];
128  for(std::size_t j = imin; j != m_j.size()-1; j++)
129  m_j[j] = m_j[j+1];
130  m_j.back() = sav;
131  }
132  }
133  //set the values of the new added index
134  int newidx = m_j.back();
135  m_l[newidx] = m_counter;
136  noalias(row(m_pcArr,newidx)) = newPc;
137  ++m_counter;
138 
139  // this procedure recomputes v vectors correctly, in the original LM-CMA-ES they were outdated/corrupted.
140  // all vectors v_k,v_{k+1},...,v_m are corrupted where k=j_imin. it also computes the proper v and b/d values for the newest
141  // inserted vector
142  RealVector v;
143  for(std::size_t i = imin; i != m_j.size(); ++i)
144  {
145  int index = m_j[i];
146  inv(v,row(m_pcArr,index),i);
147  noalias(row(m_vArr,index)) = v;
148 
149  double normv2 = norm_sqr(row(m_vArr,index));
150  double c = std::sqrt(1.0-m_alpha);
151  double f = std::sqrt(1+m_alpha/(1-m_alpha)*normv2);
152  m_b[index] = c/normv2*(f-1);
153  m_d[index] = 1/(c*normv2)*(1-1/f);
154  }
155  }
156 
157 private:
158  template<class T>
159  void inv(RealVector& x, T const& z,std::size_t k)const{
160  x = z;
161  double c= 1.0/std::sqrt(1-m_alpha);
162  for(std::size_t j=0; j != k; j++){// O(m*n)
163  std::size_t jcur = m_j[j];
164  double k = m_d(jcur) * inner_prod(row(m_vArr,jcur),x);
165  noalias(x) = c*x - k*row(m_vArr,jcur);
166  }
167  }
168 
169  //variables making up A
170  RealMatrix m_vArr;
171  RealMatrix m_pcArr;
172  RealVector m_b;
173  RealVector m_d;
174 
175  //index variables for computation of A
176  std::vector<std::size_t> m_j;
177  std::vector<std::size_t> m_l;
178  std::size_t m_Nsteps;
179  std::size_t m_maxStoredVectors;
180  std::size_t m_counter;
181 
182  double m_alpha;
183 };
184 }
185 
186 /// \brief Implements a Limited-Memory-CMA
187 ///
188 /// This is the algorithm as proposed in
189 /// Ilya Loshchilov, "A Computationally Efficient Limited Memory CMA-ES for Large Scale Optimization"
190 /// with a few corrections regarding the covariance matrix update.
191 ///
192 /// The algorithm stores a subset of previous evolution path vectors and approximates the covariance
193 /// matrix based on this. This algorithm only requires O(nm) memory, where n is the dimensionality
194 /// and n the problem dimensionality. To be more exact, 2*m vectors of size n are stored to calculate
195 /// the matrix-vector product with the choelsky factor of the covariance matrix in O(mn).
196 ///
197 /// The algorithm uses the population based step size adaptation strategy as proposed in
198 /// the same paper.
199 class LMCMA: public AbstractSingleObjectiveOptimizer<RealVector >
200 {
201 public:
202  /// \brief Default c'tor.
203  LMCMA(DefaultRngType& rng = Rng::globalRng):mpe_rng(&rng){
204  m_features |= REQUIRES_VALUE;
205  }
206 
207 
208  /// \brief From INameable: return the class name.
209  std::string name() const
210  { return "LMCMA-ES"; }
211 
212  /// \brief Calculates lambda for the supplied dimensionality n.
213  unsigned suggestLambda( unsigned int dimension ) {
214  unsigned lambda = unsigned( 4. + ::floor( 3. * ::log( static_cast<double>( dimension ) ) ) );
215  // heuristic for small search spaces
216  lambda = std::max<unsigned int>( 5, std::min( lambda, dimension ) );
217  return( lambda );
218  }
219 
220  /// \brief Calculates mu for the supplied lambda and the recombination strategy.
221  double suggestMu( unsigned int lambda) {
222  return lambda / 2.;
223  }
224 
226 
227  /// \brief Initializes the algorithm for the supplied objective function.
228  void init( ObjectiveFunctionType& function, SearchPointType const& p) {
229  unsigned int lambda = suggestLambda( p.size() );
230  unsigned int mu = suggestMu( lambda );
231  init( function,
232  p,
233  lambda,
234  mu,
235  1.0/std::sqrt(double(p.size()))
236  );
237  }
238 
239  /// \brief Initializes the algorithm for the supplied objective function.
240  void init(
241  ObjectiveFunctionType& function,
242  SearchPointType const& initialSearchPoint,
243  unsigned int lambda,
244  double mu,
245  double initialSigma
246  ) {
247  checkFeatures(function);
248  function.init();
249 
250  m_numberOfVariables = function.numberOfVariables();
251  m_lambda = lambda;
252  m_mu = static_cast<unsigned int>(::floor(mu));
253 
254  //set initial point
255  m_mean = initialSearchPoint;
256  m_best.point = initialSearchPoint;
257  m_best.value = function(initialSearchPoint);
258 
259  //init step size adaptation
260  m_stepSize.init(initialSigma);
261 
262  //weighting of the mu-best individuals
263  m_weights.resize(m_mu);
264  for (unsigned int i = 0; i < m_mu; i++){
265  m_weights(i) = ::log(mu + 0.5) - ::log(1. + i);
266  }
267  m_weights /= sum(m_weights);
268 
269  // learning rates
270  m_muEff = 1. / sum(sqr(m_weights)); // equal to sum(m_weights)^2 / sum(sqr(m_weights))
271  double c1 = 1/(10*std::log(m_numberOfVariables+1.0));
272  m_cC =1.0/m_lambda;
273 
274  //init variables for covariance matrix update
275  m_evolutionPathC = blas::repeat(0.0,m_numberOfVariables);
276  m_A.init(c1,m_numberOfVariables,lambda,lambda);
277  }
278 
279  /// \brief Executes one iteration of the algorithm.
280  void step(ObjectiveFunctionType const& function){
281 
282  std::vector< Individual<RealVector, double, RealVector> > offspring( m_lambda );
283 
284  PenalizingEvaluator penalizingEvaluator;
285  for( unsigned int i = 0; i < offspring.size(); i++ ) {
286  createSample(offspring[i].searchPoint(),offspring[i].chromosome());
287  }
288  penalizingEvaluator( function, offspring.begin(), offspring.end() );
289 
290  // Selection and parameter update
291  // opposed to normal CMA selection, we don't remove any indidivudals but only order
292  // them by rank to allow the use of the population based strategy.
293  std::vector< Individual<RealVector, double, RealVector> > parents( lambda() );
295  selection(offspring.begin(),offspring.end(),parents.begin(), parents.end());
296  updateStrategyParameters( parents );
297 
298  //update the best solution found so far.
299  m_best.point= parents[ 0 ].searchPoint();
300  m_best.value= parents[ 0 ].unpenalizedFitness();
301  }
302 
303  /// \brief Accesses the current step size.
304  double sigma() const {
305  return m_stepSize.stepSize();
306  }
307 
308  /// \brief Accesses the current population mean.
309  RealVector const& mean() const {
310  return m_mean;
311  }
312 
313  /// \brief Accesses the current weighting vector.
314  RealVector const& weights() const {
315  return m_weights;
316  }
317 
318  /// \brief Accesses the evolution path for the covariance matrix update.
319  RealVector const& evolutionPath() const {
320  return m_evolutionPathC;
321  }
322 
323  /// \brief Returns the size of the parent population \f$\mu\f$.
324  unsigned int mu() const {
325  return m_mu;
326  }
327 
328  /// \brief Returns a mutabl rference to the size of the parent population \f$\mu\f$.
329  unsigned int& mu(){
330  return m_mu;
331  }
332 
333  /// \brief Returns a immutable reference to the size of the offspring population \f$\mu\f$.
334  unsigned int lambda()const{
335  return m_lambda;
336  }
337 
338  /// \brief Returns a mutable reference to the size of the offspring population \f$\mu\f$.
339  unsigned int & lambda(){
340  return m_lambda;
341  }
342 
343 private:
344  /// \brief Updates the strategy parameters based on the supplied offspring population.
345  void updateStrategyParameters( std::vector<Individual<RealVector, double, RealVector> > const& offspring ) {
346  //line 8, creation of the new mean (but not updating the mean of the distribution
347  RealVector m( m_numberOfVariables, 0. );
348  for( unsigned int j = 0; j < mu(); j++ ){
349  noalias(m) += m_weights( j ) * offspring[j].searchPoint();
350  }
351 
352  //update evolution path, line 9
353  noalias(m_evolutionPathC) = (1. - m_cC ) * m_evolutionPathC + std::sqrt( m_cC * (2. - m_cC) * m_muEff ) * (m - m_mean) / sigma();
354 
355  //update mean now, as oldmean is not needed any more (line 8 continued)
356  m_mean = m;
357 
358  //corrected version of lines 10-14- the covariance matrix adaptation
359  //we replace one vector that makes up the approximation of A by the newly updated evolution path
360  m_A.update(m_evolutionPathC);
361 
362  //update the step size using the population success rule, line 15-18
363  m_stepSize.update(offspring);
364 
365  }
366 
367  /// \brief Creates a vector-sample pair x=Az, where z is a gaussian random vector.
368  void createSample(RealVector& x,RealVector& z)const{
369  x.resize(m_numberOfVariables);
370  z.resize(m_numberOfVariables);
371  for(std::size_t i = 0; i != m_numberOfVariables; ++i){
372  z(i) = gauss(*mpe_rng,0,1);
373  }
374  m_A.prod(x,z);
375  noalias(x) = sigma()*x +m_mean;
376  }
377 
378  unsigned int m_numberOfVariables; ///< Stores the dimensionality of the search space.
379  unsigned int m_mu; ///< The size of the parent population.
380  unsigned int m_lambda; ///< The size of the offspring population, needs to be larger than mu.
381 
382  double m_cC;///< learning rate of the evolution path
383 
384 
385  detail::IncrementalCholeskyMatrix m_A;
386  PopulationBasedStepSizeAdaptation m_stepSize;///< step size adaptation for the step size sigma()
387 
388  RealVector m_mean; ///< current mean of the distribution
389  RealVector m_weights;///< weighting for the mu best individuals
390  double m_muEff;///< effective sample size for the weighted samples
391 
392  RealVector m_evolutionPathC;///< evolution path
393 
394  DefaultRngType* mpe_rng;
395 
396 };
397 
398 }
399 
400 #endif