KernelBudgetedSGDTrainer.h
//===========================================================================
/*!
 *
 *
 * \brief Budgeted stochastic gradient descent training for kernel-based models.
 *
 * \par This is an implementation of the BSGD algorithm, developed by
 * Wang, Crammer and Vucetic: Breaking the curse of kernelization:
 * Budgeted stochastic gradient descent for large-scale SVM training, JMLR 2012.
 * At its core this is Pegasos, i.e. something similar to a perceptron. The main
 * difference is that we restrict the sparsity of the weight vector to a (currently
 * predefined) value, the budget. Whenever this limit is reached, we have to
 * decide how to add a new vector to the model without destroying the
 * sparsity. Several methods have been proposed for this; the main insight of
 * Wang et al. is to merge two budget vectors (i.e. two vectors in the model):
 * the first one is found by searching for the smallest norm of its alpha
 * coefficients, the second one by solving an optimization problem, which yields
 * a roughly optimal pair. Merging this pair frees space in the budget for a
 * new vector. Such strategies are called budget maintenance strategies.
 *
 * \par This implementation owes much to the 'reference' implementation
 * in the BudgetedSVM software.
 *
 *
 * \author T. Glasmachers, Aydin Demircioglu
 * \date 2014
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_KERNELBUDGETEDSGDTRAINER_H
#define SHARK_ALGORITHMS_KERNELBUDGETEDSGDTRAINER_H

#include <iostream>
#include <limits>

#include <shark/Algorithms/Trainers/AbstractTrainer.h>
#include <shark/Algorithms/Trainers/Budgeted/AbstractBudgetMaintenanceStrategy.h>
#include <shark/Core/IParameterizable.h>
#include <shark/Data/Dataset.h>
#include <shark/Models/Kernels/AbstractKernelFunction.h>
#include <shark/Models/Kernels/KernelExpansion.h>
#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>
#include <shark/Rng/GlobalRng.h>


namespace shark
{

///
/// \brief Budgeted stochastic gradient descent training for kernel-based models.
///
/// \par This is an implementation of the BSGD algorithm, developed by
/// Wang, Crammer and Vucetic: Breaking the curse of kernelization:
/// Budgeted stochastic gradient descent for large-scale SVM training, JMLR 2012.
/// At its core this is Pegasos, i.e. something similar to a perceptron. The main
/// difference is that we restrict the sparsity of the weight vector to a (currently
/// predefined) value, the budget. Whenever this limit is reached, we have to
/// decide how to add a new vector to the model without destroying the
/// sparsity. Several methods have been proposed for this; the main insight of
/// Wang et al. is to merge two budget vectors (i.e. two vectors in the model):
/// the first one is found by searching for the smallest norm of its alpha
/// coefficients, the second one by solving an optimization problem, which yields
/// a roughly optimal pair. Merging this pair frees space in the budget for a
/// new vector. Such strategies are called budget maintenance strategies.
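///
/// \par To sketch the update rule implemented in train() below (the
/// Pegasos-style step of the paper, with lambda = 1/(C*ell) and t the current
/// iteration): draw a random training example (x_t, y_t); scale all alpha
/// coefficients by (1 - 1/t); if the score of the true class exceeds the
/// score of the runner-up class by less than the minimum margin, add x_t to
/// the budget with coefficient +1/(lambda*t) for the true class and
/// -1/(lambda*t) for the runner-up class, then let the budget maintenance
/// strategy restore the budget size.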
///
/// \par This implementation owes much to the 'reference' implementation
/// in the BudgetedSVM software.
///
/// \par For the documentation of the basic SGD algorithm, please refer to
/// KernelSGDTrainer.h. Note that we did not take over the special alpha scaling
/// from that class. Therefore this class is perhaps numerically not as robust as SGD.
///
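/// \par The following is a minimal usage sketch. It assumes the merge-based
/// maintenance strategy shipped with Shark; header paths and class names may
/// differ between Shark versions and should be checked against the installed
/// sources:
/// \code
/// #include <shark/Models/Kernels/GaussianRbfKernel.h>
/// #include <shark/ObjectiveFunctions/Loss/HingeLoss.h>
/// #include <shark/Algorithms/Trainers/Budgeted/KernelBudgetedSGDTrainer.h>
/// #include <shark/Algorithms/Trainers/Budgeted/MergeBudgetMaintenanceStrategy.h>
///
/// using namespace shark;
///
/// ClassificationDataset data;                  // assumed to be filled with training data
/// GaussianRbfKernel<> kernel(0.5);             // RBF kernel with bandwidth gamma = 0.5
/// HingeLoss loss;                              // (sub-)differentiable loss
/// MergeBudgetMaintenanceStrategy<RealVector> strategy;
///
/// // budget of 500 vectors, no offset term, C = 1.0
/// KernelBudgetedSGDTrainer<RealVector> trainer(&kernel, &loss, 1.0, false, false, 500, &strategy);
/// KernelClassifier<RealVector> classifier;
/// trainer.train(classifier, data);
/// \endcode
///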
template <class InputType, class CacheType = float>
class KernelBudgetedSGDTrainer : public AbstractTrainer< KernelClassifier<InputType> >, public IParameterizable
{
public:
    typedef AbstractKernelFunction<InputType> KernelType;
    typedef KernelClassifier<InputType> ClassifierType;
    typedef KernelExpansion<InputType> ModelType;
    typedef AbstractLoss<unsigned int, RealVector> LossType;
    typedef CacheType QpFloatType;

    typedef AbstractBudgetMaintenanceStrategy<InputType> BudgetMaintenanceStrategy;


    /// preinitialization methods
    enum preInitializationMethod {NONE, RANDOM}; // TODO: add KMEANS


    /// \brief Constructor
    /// Note that there is no cache size involved, as merging vectors will always create new ones,
    /// which makes caching largely obsolete.
    ///
    /// \param[in] kernel kernel function to use for training and prediction
    /// \param[in] loss (sub-)differentiable loss function
    /// \param[in] C regularization parameter - always the 'true' value of C, even when unconstrained is set
    /// \param[in] offset whether to train with offset/bias parameter or not
    /// \param[in] unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
    /// \param[in] budgetSize size of the budget/model that the final solution will have. Note that it might end up smaller.
    /// \param[in] budgetMaintenanceStrategy object that contains the logic for maintaining the budget size.
    /// \param[in] epochs number of epochs the SGD solver should run. If zero is given, the number of iterations will be max(10*datasetsize, C*datasetsize).
    /// \param[in] preInitializationMethod the method to preinitialize the budget.
    /// \param[in] minMargin the margin every vector has to obey. Usually this is 1.
    ///
    KernelBudgetedSGDTrainer(KernelType* kernel,
        const LossType* loss,
        double C,
        bool offset,
        bool unconstrained = false,
        size_t budgetSize = 500,
        BudgetMaintenanceStrategy* budgetMaintenanceStrategy = NULL,
        size_t epochs = 1,
        size_t preInitializationMethod = NONE,
        double minMargin = 1.0f)
        : m_kernel(kernel)
        , m_loss(loss)
        , m_C(C)
        , m_offset(offset)
        , m_unconstrained(unconstrained)
        , m_budgetSize(budgetSize)
        , m_budgetMaintenanceStrategy(budgetMaintenanceStrategy)
        , m_epochs(epochs)
        , m_preInitializationMethod(preInitializationMethod)
        , m_minMargin(minMargin)
    {

        // check that the maintenance strategy is not null.
        if(m_budgetMaintenanceStrategy == NULL)
            throw(SHARKEXCEPTION("KernelBudgetedSGDTrainer: No budget maintenance strategy provided!"));
    }


    /// get budget size
    /// \return budget size
    ///
    size_t budgetSize() const
    {
        return m_budgetSize;
    }


    /// set budget size
    /// \param[in] budgetSize size of budget.
    ///
    void setBudgetSize(std::size_t budgetSize)
    {
        m_budgetSize = budgetSize;
    }


    /// return pointer to the budget maintenance strategy
    /// \return pointer to the budget maintenance strategy.
    ///
    BudgetMaintenanceStrategy *budgetMaintenanceStrategy() const
    {
        return m_budgetMaintenanceStrategy;
    }


    /// set budget maintenance strategy
    /// \param[in] budgetMaintenanceStrategy set strategy to given object.
    ///
    void setBudgetMaintenanceStrategy(BudgetMaintenanceStrategy *budgetMaintenanceStrategy)
    {
        m_budgetMaintenanceStrategy = budgetMaintenanceStrategy;
    }


    /// return min margin
    /// \return current min margin
    ///
    double minMargin() const
    {
        return m_minMargin;
    }


    /// set min margin
    /// \param[in] minMargin new min margin.
    ///
    void setMinMargin(double minMargin)
    {
        m_minMargin = minMargin;
    }


    /// \brief From INameable: return the class name.
    std::string name() const
    {
        return "KernelBudgetedSGDTrainer";
    }


    /// Train routine.
    /// \param[in] classifier classifier object for the final solution.
    /// \param[in] dataset dataset to work with.
    ///
    void train(ClassifierType &classifier, const LabeledData<InputType, unsigned int> &dataset)
    {

        std::size_t ell = dataset.numberOfElements();
        std::size_t classes = numberOfClasses(dataset);

        // is the budget size larger than reasonable?
        if(m_budgetSize > ell)
        {
            // in this case we just set the budget size to the given dataset size, so basically
            // there is an infinite budget.
            m_budgetSize = ell;
        }

        // we always need one more budget vector than the user specified,
        // as we first have to add any new vector to the budget before applying
        // the maintenance strategy. an alternative would be to keep the budget size
        // correct and test explicitly for the new support vector, but that would
        // create even more hassle on the other side. or one could use a vector of
        // budget vectors instead, but lose the nice framework of kernel expansions.
        // so the last budget vector must always have zero alpha coefficients in
        // the final model. (we do not check for that, but roughly assume it in
        // the strategies, e.g. by putting the new vector at the last position in the
        // merge strategy).
        m_budgetSize = m_budgetSize + 1;

        // easy access
        UIntVector y = createBatch(dataset.labels().elements());

        // create a preinitialized budget.
        // this is used to initialize the kernel expansion we will work with.
        LabeledData<InputType, unsigned int> preinitializedBudgetVectors(m_budgetSize, dataset.element(0));

        // preinit the vectors first.
        // we do this even if no preinitialization was requested, as we need the vectors
        // in the constructor of the kernel expansion. the alphas will be set to zero for NONE.
        if((m_preInitializationMethod == NONE) || (m_preInitializationMethod == RANDOM))
        {
            for(size_t j = 0; j < m_budgetSize; j++)
            {
                // choose a random vector
                std::size_t b = Rng::discrete(0, ell - 1);

                // copy over the vector
                preinitializedBudgetVectors.element(j) = dataset.element(b);
            }
        }

        /*
        // TODO: kmeans initialization
        if (m_preInitializationMethod == KMEANS) {
            // cluster the positive and the negative examples individually. the number of clusters should
            // then follow the ratio of the classes. then we can set the alphas easily.
            // TODO: do this multiclass
            // TODO: maybe Kmedoid makes more sense because of the alphas.
            // TODO: allow for different maxiters
            Centroids centroids;
            size_t maxIterations = 50;
            kMeans(dataset.inputs(), m_budgetSize, centroids, maxIterations);

            // copy over to our budget
            Data<RealVector> const& c = centroids.centroids();

            for (size_t j = 0; j < m_budgetSize; j++) {
                preinitializedBudgetVectors.inputs().element(j) = c.element(j);
                preinitializedBudgetVectors.labels().element(j) = 1; //FIXME
            }
        }
        */

        // the budget is a kernel expansion in its own right
        ModelType &budgetModel = classifier.decisionFunction();
        RealMatrix &budgetAlpha = budgetModel.alpha();
        budgetModel.setStructure(m_kernel, preinitializedBudgetVectors.inputs(), m_offset, classes);


        // variables
        const double lambda = 1.0 / (ell * m_C);
        std::size_t iterations;


        // set the number of iterations
        if(m_epochs == 0)
            iterations = std::max(10 * ell, std::size_t(std::ceil(m_C * ell)));
        else
            iterations = m_epochs * ell;


        // set the initial alphas (we do this here, after the array has been initialized by setStructure)
        if(m_preInitializationMethod == RANDOM)
        {
            for(size_t j = 0; j < m_budgetSize; j++)
            {
                size_t c = preinitializedBudgetVectors.labels().element(j);
                budgetAlpha(j, c) = 1 / (1 + lambda);
                budgetAlpha(j, (c + 1) % classes) = -1 / (1 + lambda);
            }
        }


        // whatever strategy we used, the last budget vector needs
        // to be zeroed out: either it was zero anyway (NONE preinit),
        // or it is the extra budget vector we need for technical reasons.
        row(budgetAlpha, m_budgetSize - 1) *= 0;


        // preinitialize everything to prevent costly memory allocations in the loop
        RealVector predictions(classes, 0.0);
        RealVector derivative(classes, 0.0);

        // SGD loop
        std::size_t b = 0;

        for(std::size_t iter = 0; iter < iterations; iter++)
        {
            // active variable
            b = Rng::discrete(0, ell - 1);

            // for smaller datasets, permuting the dataset instead of choosing
            // samples randomly can be a valid strategy. we do not implement
            // that here.

            // compute the prediction within the budgeted model;
            // this computes the predictions for all classes in one step.
            budgetModel.eval(dataset.inputs().element(b), predictions);

            // now we follow the crammer-singer model as written
            // in the paper (p. 11 top): we compute the scores of the true
            // class and the runner-up class. for the latter we remove
            // our true prediction temporarily and redo the argmax.

            RealVector predictionsCopy = predictions;
            unsigned int trueClass = y[b];
            double scoreOfTrueClass = predictions[trueClass];
            predictions[trueClass] = -std::numeric_limits<double>::infinity();
            unsigned int runnerupClass = (unsigned int)arg_max(predictions);
            double scoreOfRunnerupClass = predictions[runnerupClass];

            SHARK_ASSERT(trueClass != runnerupClass);

            // scale the alphas
            budgetModel.alpha() *= ((long double)(1.0 - 1.0 / (iter + 1.0)));

            // check if there is a margin violation
            if(scoreOfTrueClass - scoreOfRunnerupClass < m_minMargin)
            {
                // TODO: check if the current vector is already part of our budget

                // as we do not use the predictions anymore, we reuse them to push the new
                // alpha values to the budgeted model
                predictions.clear();

                // set the alpha values (see p. 11, beta_t^{(i)} formula in wang, crammer, vucetic)
                // alpha of the true class
                predictions[trueClass] = 1.0 / ((long double)(iter + 1.0) * lambda);

                // alpha of the runner-up class
                predictions[runnerupClass] = -1.0 / ((long double)(iter + 1.0) * lambda);

                m_budgetMaintenanceStrategy->addToModel(budgetModel, predictions, dataset.element(b));
            }
        }

        // finally we need to get rid of zero support vectors.
        budgetModel.sparsify();

    }

    /// Return the number of training epochs.
    /// A value of 0 indicates that the default of max(10, C) should be used.
    std::size_t epochs() const
    {
        return m_epochs;
    }

    /// Set the number of training epochs.
    /// A value of 0 indicates that the default of max(10, C) should be used.
    void setEpochs(std::size_t value)
    {
        m_epochs = value;
    }

    /// get the kernel function
    KernelType *kernel()
    {
        return m_kernel;
    }
    /// get the kernel function
    const KernelType *kernel() const
    {
        return m_kernel;
    }
    /// set the kernel function
    void setKernel(KernelType *kernel)
    {
        m_kernel = kernel;
    }

    /// check whether the parameter C is represented as log(C),
    /// i.e., in a form suitable for unconstrained optimization,
    /// in the parameter vector
    bool isUnconstrained() const
    {
        return m_unconstrained;
    }

    /// return the value of the regularization parameter
    double C() const
    {
        return m_C;
    }

    /// set the value of the regularization parameter (must be positive)
    void setC(double value)
    {
        RANGE_CHECK(value > 0.0);
        m_C = value;
    }

    /// check whether the model to be trained should include an offset term
    bool trainOffset() const
    {
        return m_offset;
    }

    ///\brief Returns the vector of hyper-parameters.
    RealVector parameterVector() const
    {
        size_t kp = m_kernel->numberOfParameters();
        RealVector ret(kp + 1);

        if(m_unconstrained)
            init(ret) << parameters(m_kernel), log(m_C);
        else
            init(ret) << parameters(m_kernel), m_C;

        return ret;
    }

    ///\brief Sets the vector of hyper-parameters.
    void setParameterVector(RealVector const &newParameters)
    {
        size_t kp = m_kernel->numberOfParameters();
        SHARK_ASSERT(newParameters.size() == kp + 1);
        init(newParameters) >> parameters(m_kernel), m_C;

        if(m_unconstrained) m_C = exp(m_C);
    }

    ///\brief Returns the number of hyper-parameters.
    size_t numberOfParameters() const
    {
        return m_kernel->numberOfParameters() + 1;
    }

protected:
    KernelType *m_kernel;       ///< pointer to kernel function
    const LossType *m_loss;     ///< pointer to loss function
    double m_C;                 ///< regularization parameter
    bool m_offset;              ///< should the resulting model have an offset term?
    bool m_unconstrained;       ///< should C be stored as log(C) as a parameter?

    // budget size
    std::size_t m_budgetSize;

    // budget maintenance strategy
    BudgetMaintenanceStrategy *m_budgetMaintenanceStrategy;

    std::size_t m_epochs;       ///< number of training epochs (sweeps over the data), or 0 for default = max(10, C)

    // method to preinitialize the budget
    std::size_t m_preInitializationMethod;

    // needed margin below which we update the model, also called beta sometimes
    double m_minMargin;
};

}
#endif