QpMcLinear.h
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Quadratic programming solvers for linear multi-class SVM training without bias.
6  *
7  *
8  *
9  * \author T. Glasmachers
10  * \date -
11  *
12  *
13  * \par Copyright 1995-2015 Shark Development Team
14  *
15  * <BR><HR>
16  * This file is part of Shark.
17  * <http://image.diku.dk/shark/>
18  *
19  * Shark is free software: you can redistribute it and/or modify
20  * it under the terms of the GNU Lesser General Public License as published
21  * by the Free Software Foundation, either version 3 of the License, or
22  * (at your option) any later version.
23  *
24  * Shark is distributed in the hope that it will be useful,
25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27  * GNU Lesser General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License
30  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31  *
32  */
33 //===========================================================================
34 
35 
36 #ifndef SHARK_ALGORITHMS_QP_QPMCLINEAR_H
37 #define SHARK_ALGORITHMS_QP_QPMCLINEAR_H
38 
39 #include <shark/Core/Timer.h>
40 #include <shark/Algorithms/QP/QuadraticProgram.h> // declares QpStoppingCondition and QpSolutionProperties; reconstructed: this include is missing from the listing and its exact path is assumed
41 #include <shark/Data/Dataset.h>
42 #include <shark/Data/DataView.h>
43 #include <shark/LinAlg/Base.h>
44 #include <cmath>
45 #include <iostream>
46 #include <vector>
47 
48 
49 namespace shark {
50 
51 
52 
53 // strategy constants
54 #define CHANGE_RATE 0.2
55 #define PREF_MIN 0.05
56 #define PREF_MAX 20.0
57 
58 // inner iteration limit
59 #define MAXITER_MULTIPLIER 10
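   // Note: CHANGE_RATE, PREF_MIN and PREF_MAX control how the example-wise preferences of the
   // ACF coordinate selection strategy are adapted and clamped in QpMcLinear::solve, while
   // MAXITER_MULTIPLIER * (number of classes) bounds the SMO iterations spent on each
   // per-example sub-problem in solveSub.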
60 
61 
62 /// \brief Generic solver skeleton for linear multi-class SVM problems.
63 template <class InputT>
64 class QpMcLinear
65 {
66 public:
67  typedef LabeledData<InputT, unsigned int> DatasetType;
68  typedef typename Batch<InputT>::const_reference InputReferenceType;
69 
70  // coordinate selection strategy constants (this declaration and the typedefs above are reconstructed from their use below; the original lines are omitted from this listing)
71  enum {UNIFORM, ACF};
72 
73 
74  ///
75  /// \brief Constructor
76  ///
77  /// \param dataset training data
78  /// \param dim problem dimension
79  /// \param classes number of classes in the problem
80  /// \param strategy coordinate selection strategy
81  /// \param shrinking flag turning shrinking on and off
82  ///
 83  QpMcLinear(
 84  const DatasetType& dataset,
85  std::size_t dim,
86  std::size_t classes,
87  std::size_t strategy = ACF,
88  bool shrinking = false)
89  : m_data(dataset)
90  , m_xSquared(dataset.numberOfElements())
91  , m_dim(dim)
92  , m_classes(classes)
93  , m_strategy(strategy)
94  , m_shrinking(shrinking)
95  {
96  SHARK_ASSERT(m_dim > 0);
97 
98  for (std::size_t i=0; i<m_data.size(); i++)
99  {
100  m_xSquared(i) = inner_prod(m_data[i].input, m_data[i].input);
101  }
102  }
103 
104  ///
105  /// \brief Solve the SVM training problem.
106  ///
107  /// \param C regularization constant of the SVM
108  /// \param stop stopping condition(s)
109  /// \param prop solution properties
110  /// \param verbose if true, the solver prints status information and solution statistics
111  ///
112  RealMatrix solve(
113  double C,
114  QpStoppingCondition& stop,
115  QpSolutionProperties* prop = NULL,
116  bool verbose = false)
117  {
118  // sanity checks
119  SHARK_ASSERT(C > 0.0);
120 
121  // measure training time
122  Timer timer;
123  timer.start();
124 
125  // prepare dimensions and vectors
126  std::size_t ell = m_data.size(); // number of training examples
 127  RealMatrix alpha(ell, m_classes + 1, 0.0); // Lagrange multipliers (dual variables); the extra column is used by some of the solvers below (e.g., Crammer-Singer) to store the per-example sum of the variables
128  RealMatrix w(m_classes, m_dim, 0.0); // weight vectors; primal variables
129 
130  // scheduling of steps, for ACF only
131  RealVector pref(ell, 1.0); // example-wise measure of success
132  double prefsum = ell; // normalization constant
133 
134  std::vector<std::size_t> schedule(ell);
135  if (m_strategy == UNIFORM)
136  {
137  for (std::size_t i=0; i<ell; i++) schedule[i] = i;
138  }
139 
140  // used for shrinking
141  std::size_t active = ell;
142 
143  // prepare counters
144  std::size_t epoch = 0;
145  std::size_t steps = 0;
146 
147  // prepare performance monitoring
148  double objective = 0.0;
149  double max_violation = 0.0;
150 
151  // gain for ACF
152  const double gain_learning_rate = 1.0 / ell;
153  double average_gain = 0.0;
154 
155 
156  // outer optimization loop (epochs)
157  bool canstop = true;
158  while (true)
159  {
160  if (m_strategy == ACF)
161  {
162  // define schedule
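            // Build a schedule of ell entries in which example i appears roughly
            // (pref(i) / prefsum) * ell times; the fractional remainder is rounded up with
            // probability equal to its value, so expected frequencies match the preferences.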
163  double psum = prefsum;
164  prefsum = 0.0;
165  std::size_t pos = 0;
166  for (std::size_t i=0; i<ell; i++)
167  {
168  double p = pref(i);
169  double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
170  std::size_t n = (std::size_t)std::floor(num);
171  double prob = num - n;
172  if (Rng::uni() < prob) n++;
173  for (std::size_t j=0; j<n; j++)
174  {
175  schedule[pos] = i;
176  pos++;
177  }
178  psum -= p;
179  prefsum += p;
180  }
181  SHARK_ASSERT(pos == ell);
182  }
183 
184  if (m_shrinking == true)
185  {
186  for (std::size_t i=0; i<active; i++)
187  std::swap(schedule[i], schedule[Rng::discrete(0, active - 1)]);
188  }
189  else
190  {
191  for (std::size_t i=0; i<ell; i++)
192  std::swap(schedule[i], schedule[Rng::discrete(0, ell - 1)]);
193  }
194 
195  // inner loop (one epoch)
196  max_violation = 0.0;
197  size_t nPoints = ell;
198  if (m_shrinking == true)
199  nPoints = active;
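            // With shrinking, only the first 'active' schedule entries are visited; examples found
            // KKT-optimal are swapped behind this boundary below and skipped until a full sweep is
            // forced before the stopping criterion is finally accepted.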
200 
201  for (std::size_t j=0; j<nPoints; j++)
202  {
203  // active example
204  double gain = 0.0;
205  const std::size_t i = schedule[j];
206  InputReferenceType x_i = m_data[i].input;
207  const unsigned int y_i = m_data[i].label;
208  const double q = m_xSquared(i);
209  RealMatrixRow a = row(alpha, i);
210 
211  // compute gradient and KKT violation
212  RealVector wx = prod(w,x_i);
213  RealVector g(m_classes);
214  double kkt = calcGradient(g, wx, a, C, y_i);
215 
216  if (kkt > 0.0)
217  {
218  max_violation = std::max(max_violation, kkt);
219 
220  // perform the step on alpha
221  RealVector mu(m_classes, 0.0);
222  gain = solveSub(0.1 * stop.minAccuracy, g, q, C, y_i, a, mu);
223  objective += gain;
224  steps++;
225 
226  // update weight vectors
227  updateWeightVectors(w, mu, i);
228  }
229  else if (m_shrinking == true)
230  {
231  active--;
232  std::swap(schedule[j], schedule[active]);
233  j--;
234  }
235 
236  // update gain-based preferences
237  if (m_strategy == ACF)
238  {
239  if (epoch == 0) average_gain += gain / (double)ell;
240  else
241  {
242  double change = CHANGE_RATE * (gain / average_gain - 1.0);
243  double newpref = std::min(PREF_MAX, std::max(PREF_MIN, pref(i) * std::exp(change)));
244  prefsum += newpref - pref(i);
245  pref(i) = newpref;
246  average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
247  }
248  }
249  }
250 
251  epoch++;
252 
253  // stopping criteria
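            // The accuracy test is only trusted (canstop) after a full, unbiased sweep: with ACF the
            // preferences are first reset to uniform, and with shrinking the active set is restored to
            // all examples, so that max_violation reflects every training example.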
254  if (stop.maxIterations > 0 && epoch * ell >= stop.maxIterations)
255  {
256  if (prop != NULL) prop->type = QpMaxIterationsReached;
257  break;
258  }
259 
260  if (timer.stop() >= stop.maxSeconds)
261  {
262  if (prop != NULL) prop->type = QpTimeout;
263  break;
264  }
265 
266  if (max_violation < stop.minAccuracy)
267  {
268  if (verbose)
269  std::cout << "#" << std::flush;
270  if (canstop)
271  {
272  if (prop != NULL) prop->type = QpAccuracyReached;
273  break;
274  }
275  else
276  {
277  if (m_strategy == ACF)
278  {
 279  // prepare full sweep for a reliable check of the stopping criterion
280  canstop = true;
281  for (std::size_t i=0; i<ell; i++) pref(i) = 1.0;
282  prefsum = ell;
283  }
284 
285  if (m_shrinking == true)
286  {
 287  // prepare full sweep for a reliable check of the stopping criterion
288  active = ell;
289  canstop = true;
290  }
291  }
292  }
293  else
294  {
295  if (verbose) std::cout << "." << std::flush;
296  if (m_strategy == ACF)
297  canstop = false;
298  if (m_shrinking == true)
299  canstop = (active == ell);
300  }
301  }
302  timer.stop();
303 
304  // calculate dual objective value
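         // The dual objective recomputed below is D(alpha) = sum_i sum_c alpha(i,c) - 0.5 * sum_c ||w_c||^2,
         // where the extra bookkeeping column of alpha is excluded from the sum.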
305  objective = 0.0;
306  for (std::size_t j=0; j<m_classes; j++)
307  {
308  for (std::size_t d=0; d<m_dim; d++) objective -= w(j, d) * w(j, d);
309  }
310  objective *= 0.5;
311  for (std::size_t i=0; i<ell; i++)
312  {
313  for (std::size_t j=0; j<m_classes; j++) objective += alpha(i, j);
314  }
315 
316  // return solution statistics
317  if (prop != NULL)
318  {
319  prop->accuracy = max_violation; // this is approximate, but a good guess
320  prop->iterations = ell * epoch;
321  prop->value = objective;
322  prop->seconds = timer.lastLap();
323  }
324 
325  // output solution statistics
326  if (verbose)
327  {
328  std::cout << std::endl;
329  std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
330  std::cout << "number of epochs: " << epoch << std::endl;
331  std::cout << "number of iterations: " << (ell * epoch) << std::endl;
332  std::cout << "number of non-zero steps: " << steps << std::endl;
333  std::cout << "dual accuracy: " << max_violation << std::endl;
334  std::cout << "dual objective value: " << objective << std::endl;
335  }
336 
337  // return the solution
338  return w;
339  }
340 
341 protected:
342  // for all c: row(w, c) += mu(c) * x
343  void add_scaled(RealMatrix& w, RealVector const& mu, InputReferenceType x)
344  {
345  for (std::size_t c=0; c<m_classes; c++) noalias(row(w, c)) += mu(c) * x;
346  }
347 
348  /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
349  ///
350  /// \param gradient gradient vector to be filled in. The vector is correctly sized.
351  /// \param wx inner products of weight vectors with the current sample; wx(c) = <w_c, x>
352  /// \param alpha variables corresponding to the current sample
353  /// \param C upper bound on the variables
354  /// \param y label of the current sample
355  ///
356  /// \return The function must return the violation of the KKT conditions.
357  virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y) = 0;
358 
359  /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
360  ///
361  /// \param w matrix of (dense) weight vectors (as rows)
362  /// \param mu dual step on the variables corresponding to the current sample
363  /// \param index current sample
364  virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index) = 0;
365 
366  /// \brief Solve the sub-problem posed by a single training example.
367  ///
368  /// \param epsilon accuracy (dual gradient) up to which the sub-problem should be solved
369  /// \param gradient gradient of the objective function w.r.t. alpha
370  /// \param q squared norm of the current sample
371  /// \param C upper bound on the variables
372  /// \param y label of the current sample
373  /// \param alpha input: initial point; output: (near) optimal point
374  /// \param mu step from initial point to final point
375  ///
376  /// \return The function must return the gain of the step, i.e., the improvement of the objective function.
377  virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu) = 0;
378 
379  DataView<const DatasetType> m_data; ///< view on training data
380  RealVector m_xSquared; ///< diagonal entries of the quadratic matrix
381  std::size_t m_dim; ///< input space dimension
382  std::size_t m_classes; ///< number of classes
383  std::size_t m_strategy; ///< strategy for coordinate selection
384  bool m_shrinking; ///< apply shrinking or not?
385 };
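// A minimal usage sketch of the solver interface defined above (illustrative only: how the
// LabeledData object is obtained and the QpStoppingCondition defaults are assumptions, not
// part of this file):
//
//   LabeledData<RealVector, unsigned int> data = ...;    // training set, labels in {0,...,classes-1}
//   std::size_t dim = ...;                                // input space dimension
//   std::size_t classes = ...;                            // number of classes
//   QpMcLinearWW<RealVector> solver(data, dim, classes);  // Weston & Watkins machine, defined below
//   QpStoppingCondition stop;
//   stop.minAccuracy = 1e-3;                              // target dual accuracy
//   QpSolutionProperties prop;
//   RealMatrix w = solver.solve(1.0, stop, &prop, true);  // C = 1.0, verbose output
//
// Afterwards row(w, c) holds the weight vector of class c; a new input x is classified as
// argmax_c of the inner product <w_c, x>.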
386 
387 
388 //~ /// \brief Generic solver skeleton for linear multi-class SVM problems.
389 //~ template < >
390 //~ class QpMcLinear<CompressedRealVector>
391 //~ {
392 //~ public:
393  //~ typedef LabeledData<CompressedRealVector, unsigned int> DatasetType;
394 
395  //~ ///
396  //~ /// \brief Constructor
397  //~ ///
398  //~ /// \param dataset training data
399  //~ /// \param dim problem dimension
400  //~ /// \param classes number of classes in the problem
401  //~ ///
402  //~ QpMcLinear(
403  //~ const DatasetType& dataset,
404  //~ std::size_t dim,
405  //~ std::size_t classes)
406  //~ : m_data(dataset.numberOfElements())
407  //~ , m_xSquared(dataset.numberOfElements())
408  //~ , m_dim(dim)
409  //~ , m_classes(classes)
410  //~ {
411  //~ SHARK_ASSERT(m_dim > 0);
412 
413  //~ // transform ublas sparse vectors into a fast format
414  //~ // (yes, ublas is slow...), and compute the squared
415  //~ // norms of the training examples
416  //~ SparseVector sparse;
417  //~ for (std::size_t b=0, j=0; b<dataset.numberOfBatches(); b++)
418  //~ {
419  //~ DatasetType::const_batch_reference batch = dataset.batch(b);
420  //~ for (std::size_t i=0; i<batch.size(); i++)
421  //~ {
422  //~ CompressedRealVector x_i = shark::get(batch, i).input;
423  //~ unsigned int y_i = shark::get(batch, i).label;
424  //~ m_data[j].label = y_i;
425  //~ double d = 0.0;
426  //~ for (CompressedRealVector::const_iterator it=x_i.begin(); it != x_i.end(); ++it)
427  //~ {
428  //~ double v = *it;
429  //~ sparse.index = it.index();
430  //~ sparse.value = v;
431  //~ storage.push_back(sparse);
432  //~ d += v * v;
433  //~ }
434  //~ sparse.index = (std::size_t)-1;
435  //~ storage.push_back(sparse);
436  //~ m_xSquared(j) = d;
437  //~ j++;
438  //~ }
439  //~ }
440  //~ for (std::size_t i=0, k=0; i<m_data.size(); i++)
441  //~ {
442  //~ CompressedRealVector x_i = dataset.element(i).input;
443  //~ m_data[i].input = &storage[k];
444  //~ for (; storage[k].index != (std::size_t)-1; k++);
445  //~ k++;
446  //~ }
447  //~ }
448 
449  //~ ///
450  //~ /// \brief Solve the SVM training problem.
451  //~ ///
452  //~ /// \param C regularization constant of the SVM
453  //~ /// \param stop stopping condition(s)
454  //~ /// \param prop solution properties
455  //~ /// \param verbose if true, the solver prints status information and solution statistics
456  //~ ///
457  //~ RealMatrix solve(
458  //~ double C,
459  //~ QpStoppingCondition& stop,
460  //~ QpSolutionProperties* prop = NULL,
461  //~ bool verbose = false)
462  //~ {
463  //~ // sanity checks
464  //~ SHARK_ASSERT(C > 0.0);
465 
466  //~ // measure training time
467  //~ Timer timer;
468 
469  //~ // prepare dimensions and vectors
470  //~ std::size_t ell = m_data.size(); // number of training examples
471  //~ RealMatrix alpha(ell, m_classes + 1, 0.0); // Lagrange multipliers; dual variables. Reserve one extra column.
472  //~ RealMatrix w(m_classes, m_dim, 0.0); // weight vectors; primal variables
473 
474  //~ // scheduling of steps
475  //~ RealVector pref(ell, 1.0); // example-wise measure of success
476  //~ double prefsum = ell; // normalization constant
477  //~ std::vector<std::size_t> schedule(ell);
478 
479  //~ // prepare counters
480  //~ std::size_t epoch = 0;
481  //~ std::size_t steps = 0;
482 
483  //~ // prepare performance monitoring
484  //~ double objective = 0.0;
485  //~ double max_violation = 0.0;
486  //~ const double gain_learning_rate = 1.0 / ell;
487  //~ double average_gain = 0.0;
488 
489  //~ // outer optimization loop (epochs)
490  //~ bool canstop = true;
491  //~ while (true)
492  //~ {
493  //~ // define schedule
494  //~ double psum = prefsum;
495  //~ prefsum = 0.0;
496  //~ std::size_t pos = 0;
497  //~ for (std::size_t i=0; i<ell; i++)
498  //~ {
499  //~ double p = pref(i);
500  //~ double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
501  //~ std::size_t n = (std::size_t)std::floor(num);
502  //~ double prob = num - n;
503  //~ if (Rng::uni() < prob) n++;
504  //~ for (std::size_t j=0; j<n; j++)
505  //~ {
506  //~ schedule[pos] = i;
507  //~ pos++;
508  //~ }
509  //~ psum -= p;
510  //~ prefsum += p;
511  //~ }
512  //~ SHARK_ASSERT(pos == ell);
513  //~ for (std::size_t i=0; i<ell; i++) std::swap(schedule[i], schedule[Rng::discrete(0, ell - 1)]);
514 
515  //~ // inner loop (one epoch)
516  //~ max_violation = 0.0;
517  //~ for (std::size_t j=0; j<ell; j++)
518  //~ {
519  //~ // active example
520  //~ double gain = 0.0;
521  //~ const std::size_t i = schedule[j];
522  //~ const SparseVector* x_i = m_data[i].input;
523  //~ const unsigned int y_i = m_data[i].label;
524  //~ const double q = m_xSquared(i);
525  //~ RealMatrixRow a = row(alpha, i);
526 
527  //~ // compute gradient and KKT violation
528  //~ RealVector wx(m_classes, 0.0);
529  //~ for (const SparseVector* p=x_i; p->index != (std::size_t)-1; p++)
530  //~ {
531  //~ const std::size_t idx = p->index;
532  //~ const double v = p->value;
533  //~ for (size_t c=0; c<m_classes; c++) wx(c) += w(c, idx) * v;
534  //~ }
535  //~ RealVector g(m_classes);
536  //~ double kkt = calcGradient(g, wx, a, C, y_i);
537 
538  //~ if (kkt > 0.0)
539  //~ {
540  //~ max_violation = std::max(max_violation, kkt);
541 
542  //~ // perform the step on alpha
543  //~ RealVector mu(m_classes, 0.0);
544  //~ gain = solveSub(0.1 * stop.minAccuracy, g, q, C, y_i, a, mu);
545  //~ objective += gain;
546  //~ steps++;
547 
548  //~ // update weight vectors
549  //~ updateWeightVectors(w, mu, i);
550  //~ }
551 
552  //~ // update gain-based preferences
553  //~ {
554  //~ if (epoch == 0) average_gain += gain / (double)ell;
555  //~ else
556  //~ {
557  //~ double change = CHANGE_RATE * (gain / average_gain - 1.0);
558  //~ double newpref = std::min(PREF_MAX, std::max(PREF_MIN, pref(i) * std::exp(change)));
559  //~ prefsum += newpref - pref(i);
560  //~ pref(i) = newpref;
561  //~ average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
562  //~ }
563  //~ }
564  //~ }
565 
566  //~ epoch++;
567 
568  //~ // stopping criteria
569  //~ if (stop.maxIterations > 0 && epoch * ell >= stop.maxIterations)
570  //~ {
571  //~ if (prop != NULL) prop->type = QpMaxIterationsReached;
572  //~ break;
573  //~ }
574 
575  //~ if (timer.stop() >= stop.maxSeconds)
576  //~ {
577  //~ if (prop != NULL) prop->type = QpTimeout;
578  //~ break;
579  //~ }
580 
581  //~ if (max_violation < stop.minAccuracy)
582  //~ {
583  //~ if (verbose) std::cout << "#" << std::flush;
584  //~ if (canstop)
585  //~ {
586  //~ if (prop != NULL) prop->type = QpAccuracyReached;
587  //~ break;
588  //~ }
589  //~ else
590  //~ {
591  //~ // prepare full sweep for a reliable checking of the stopping criterion
592  //~ canstop = true;
593  //~ for (std::size_t i=0; i<ell; i++) pref(i) = 1.0;
594  //~ prefsum = ell;
595  //~ }
596  //~ }
597  //~ else
598  //~ {
599  //~ if (verbose) std::cout << "." << std::flush;
600  //~ canstop = false;
601  //~ }
602  //~ }
603  //~ timer.stop();
604 
605  //~ // calculate dual objective value
606  //~ objective = 0.0;
607  //~ for (std::size_t j=0; j<m_classes; j++)
608  //~ {
609  //~ for (std::size_t d=0; d<m_dim; d++) objective -= w(j, d) * w(j, d);
610  //~ }
611  //~ objective *= 0.5;
612  //~ for (std::size_t i=0; i<ell; i++)
613  //~ {
614  //~ for (std::size_t j=0; j<m_classes; j++) objective += alpha(i, j);
615  //~ }
616 
617  //~ // return solution statistics
618  //~ if (prop != NULL)
619  //~ {
620  //~ prop->accuracy = max_violation; // this is approximate, but a good guess
621  //~ prop->iterations = ell * epoch;
622  //~ prop->value = objective;
623  //~ prop->seconds = timer.lastLap();
624  //~ }
625 
626  //~ // output solution statistics
627  //~ if (verbose)
628  //~ {
629  //~ std::cout << std::endl;
630  //~ std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
631  //~ std::cout << "number of epochs: " << epoch << std::endl;
632  //~ std::cout << "number of iterations: " << (ell * epoch) << std::endl;
633  //~ std::cout << "number of non-zero steps: " << steps << std::endl;
634  //~ std::cout << "dual accuracy: " << max_violation << std::endl;
635  //~ std::cout << "dual objective value: " << objective << std::endl;
636  //~ }
637 
638  //~ // return the solution
639  //~ return w;
640  //~ }
641 
642 //~ protected:
643  //~ /// \brief Data structure for sparse vectors.
644  //~ struct SparseVector
645  //~ {
646  //~ std::size_t index;
647  //~ double value;
648  //~ };
649 
650  //~ struct ElementType
651  //~ {
652  //~ const SparseVector* input;
653  //~ unsigned int label;
654  //~ };
655 
656  //~ // for all c: row(w, c) += mu(c) * x
657  //~ void add_scaled(RealMatrix& w, RealVector const& mu, const SparseVector* x)
658  //~ {
659  //~ for (; x->index != (std::size_t)-1; x++)
660  //~ {
661  //~ const std::size_t index = x->index;
662  //~ const double value = x->value;
663  //~ for (std::size_t c=0; c<m_classes; c++) w(c, index) += mu(c) * value;
664  //~ }
665  //~ }
666 
667  //~ /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
668  //~ ///
669  //~ /// \param gradient gradient vector to be filled in. The vector is correctly sized.
670  //~ /// \param wx inner products of weight vectors with the current sample; wx(c) = <w_c, x>
671  //~ /// \param alpha variables corresponding to the current sample
672  //~ /// \param C upper bound on the variables
673  //~ /// \param y label of the current sample
674  //~ ///
675  //~ /// \return The function must return the violation of the KKT conditions.
676  //~ virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y) = 0;
677 
678  //~ /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
679  //~ ///
680  //~ /// \param w matrix of (dense) weight vectors (as rows)
681  //~ /// \param mu dual step on the variables corresponding to the current sample
682  //~ /// \param index example index
683  //~ virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index) = 0;
684 
685  //~ /// \brief Solve the sub-problem posed by a single training example.
686  //~ ///
687  //~ /// \param epsilon accuracy (dual gradient) up to which the sub-problem should be solved
688  //~ /// \param gradient gradient of the objective function w.r.t. alpha
689  //~ /// \param q squared norm of the current sample
690  //~ /// \param C upper bound on the variables
691  //~ /// \param y label of the current sample
692  //~ /// \param alpha input: initial point; output: (near) optimal point
693  //~ /// \param mu step from initial point to final point
694  //~ ///
695  //~ /// \return The function must return the gain of the step, i.e., the improvement of the objective function.
696  //~ virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu) = 0;
697 
698  //~ std::vector<SparseVector> storage; ///< storage for sparse vectors
699  //~ std::vector<ElementType> m_data; ///< resembles data view interface
700  //~ RealVector m_xSquared; ///< squared norms of the training data
701  //~ std::size_t m_dim; ///< input space dimension
702  //~ std::size_t m_classes; ///< number of classes
703 //~ };
704 
705 
706 /// \brief Solver for the multi-class SVM by Weston & Watkins.
707 template <class InputT>
708 class QpMcLinearWW : public QpMcLinear<InputT>
709 {
710 public:
 711  typedef typename QpMcLinear<InputT>::DatasetType DatasetType; // reconstructed declaration
 712 
 713  /// \brief Constructor
 714  QpMcLinearWW(
 715  const DatasetType& dataset,
716  std::size_t dim,
717  std::size_t classes)
718  : QpMcLinear<InputT>(dataset, dim, classes)
719  { }
720 
721 protected:
722  /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
723  virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y)
724  {
725  double violation = 0.0;
726  for (std::size_t c=0; c<wx.size(); c++)
727  {
728  if (c == y)
729  {
730  gradient(c) = 0.0;
731  }
732  else
733  {
734  const double g = 1.0 - 0.5 * (wx(y) - wx(c));
735  gradient(c) = g;
736  if (g > violation && alpha(c) < C) violation = g;
737  else if (-g > violation && alpha(c) > 0.0) violation = -g;
738  }
739  }
740  return violation;
741  }
742 
743  /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
744  virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
745  {
746  double sum_mu = 0.0;
747  for (std::size_t c=0; c<m_classes; c++) sum_mu += mu(c);
748  unsigned int y = m_data[index].label;
749  RealVector step(-0.5 * mu);
750  step(y) = 0.5 * sum_mu;
751  add_scaled(w, step, m_data[index].input);
752  }
753 
754  /// \brief Solve the sub-problem posed by a single training example.
755  virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu)
756  {
757  const double qq = 0.5 * q;
758  double gain = 0.0;
759 
760  // SMO loop
761  size_t iter, maxiter = MAXITER_MULTIPLIER * m_classes;
762  for (iter=0; iter<maxiter; iter++)
763  {
764  // select working set
765  std::size_t idx = 0;
766  double kkt = 0.0;
767  for (std::size_t c=0; c<m_classes; c++)
768  {
769  if (c == y) continue;
770 
771  const double g = gradient(c);
772  const double a = alpha(c);
773  if (g > kkt && a < C) { kkt = g; idx = c; }
774  else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
775  }
776 
777  // check stopping criterion
778  if (kkt < epsilon) break;
779 
780  // perform step
781  const double a = alpha(idx);
782  const double g = gradient(idx);
783  double m = g / qq;
784  double a_new = a + m;
785  if (a_new <= 0.0)
786  {
787  m = -a;
788  a_new = 0.0;
789  }
790  else if (a_new >= C)
791  {
792  m = C - a;
793  a_new = C;
794  }
795  alpha(idx) = a_new;
796  mu(idx) += m;
797 
798  // update gradient and total gain
799  const double dg = 0.5 * m * qq;
800  for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dg;
801  gradient(idx) -= dg;
802 
803  gain += m * (g - dg);
804  }
805 
806  return gain;
807  }
808 
 809 protected:
 810  using QpMcLinear<InputT>::add_scaled; // the using-declarations are reconstructed from member use in this class
 811  using QpMcLinear<InputT>::m_data;
 812  using QpMcLinear<InputT>::m_classes;
 813 };
814 
815 
816 /// \brief Solver for the multi-class SVM by Lee, Lin & Wahba.
817 template <class InputT>
818 class QpMcLinearLLW : public QpMcLinear<InputT>
819 {
820 public:
 821  typedef typename QpMcLinear<InputT>::DatasetType DatasetType;
 822 
 823  /// \brief Constructor
 824  QpMcLinearLLW(
 825  const DatasetType& dataset,
826  std::size_t dim,
827  std::size_t classes)
828  : QpMcLinear<InputT>(dataset, dim, classes)
829  { }
830 
831 protected:
832  /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
833  virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y)
834  {
835  double violation = 0.0;
836  for (std::size_t c=0; c<m_classes; c++)
837  {
838  if (c == y)
839  {
840  gradient(c) = 0.0;
841  }
842  else
843  {
844  const double g = 1.0 + wx(c);
845  gradient(c) = g;
846  if (g > violation && alpha(c) < C) violation = g;
847  else if (-g > violation && alpha(c) > 0.0) violation = -g;
848  }
849  }
850  return violation;
851  }
852 
853  /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
854  virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
855  {
856  double mean_mu = 0.0;
857  for (std::size_t c=0; c<m_classes; c++) mean_mu += mu(c);
858  mean_mu /= (double)m_classes;
859  RealVector step(m_classes);
860  for (std::size_t c=0; c<m_classes; c++) step(c) = mean_mu - mu(c);
861  add_scaled(w, step, m_data[index].input);
862  }
863 
864  /// \brief Solve the sub-problem posed by a single training example.
865  virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu)
866  {
867  const double ood = 1.0 / m_classes;
868  const double qq = (1.0 - ood) * q;
869  double gain = 0.0;
870 
871  // SMO loop
872  size_t iter, maxiter = MAXITER_MULTIPLIER * m_classes;
873  for (iter=0; iter<maxiter; iter++)
874  {
875  // select working set
876  std::size_t idx = 0;
877  double kkt = 0.0;
878  for (std::size_t c=0; c<m_classes; c++)
879  {
880  if (c == y) continue;
881 
882  const double g = gradient(c);
883  const double a = alpha(c);
884  if (g > kkt && a < C) { kkt = g; idx = c; }
885  else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
886  }
887 
888  // check stopping criterion
889  if (kkt < epsilon) break;
890 
891  // perform step
892  const double a = alpha(idx);
893  const double g = gradient(idx);
894  double m = g / qq;
895  double a_new = a + m;
896  if (a_new <= 0.0)
897  {
898  m = -a;
899  a_new = 0.0;
900  }
901  else if (a_new >= C)
902  {
903  m = C - a;
904  a_new = C;
905  }
906  alpha(idx) = a_new;
907  mu(idx) += m;
908 
909  // update gradient and total gain
910  const double dg = m * q;
911  const double dgc = dg / m_classes;
912  for (std::size_t c=0; c<m_classes; c++) gradient(c) += dgc;
913  gradient(idx) -= dg;
914 
915  gain += m * (g - 0.5 * (dg - dgc));
916  }
917 
918  return gain;
919  }
920 
 921 protected:
 922  using QpMcLinear<InputT>::add_scaled;
 923  using QpMcLinear<InputT>::m_data;
 924  using QpMcLinear<InputT>::m_classes;
 925 };
926 
927 
928 /// \brief Solver for the multi-class SVM with absolute margin and total sum loss.
929 template <class InputT>
930 class QpMcLinearATS : public QpMcLinear<InputT>
931 {
932 public:
 933  typedef typename QpMcLinear<InputT>::DatasetType DatasetType;
 934 
 935  /// \brief Constructor
 936  QpMcLinearATS(
 937  const DatasetType& dataset,
938  std::size_t dim,
939  std::size_t classes)
940  : QpMcLinear<InputT>(dataset, dim, classes)
941  { }
942 
943 protected:
944  /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
945  virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y)
946  {
947  double violation = 0.0;
948  for (std::size_t c=0; c<m_classes; c++)
949  {
950  const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
951  gradient(c) = g;
952  if (g > violation && alpha(c) < C) violation = g;
953  else if (-g > violation && alpha(c) > 0.0) violation = -g;
954  }
955  return violation;
956  }
957 
958  /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
959  virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
960  {
961  unsigned int y = m_data[index].label;
962  double mean = -2.0 * mu(y);
963  for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
964  mean /= (double)m_classes;
965  RealVector step(m_classes);
966  for (std::size_t c=0; c<m_classes; c++) step(c) = ((c == y) ? (mu(c) + mean) : (mean - mu(c)));
967  add_scaled(w, step, m_data[index].input);
968  }
969 
970  /// \brief Solve the sub-problem posed by a single training example.
971  virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu)
972  {
973  const double ood = 1.0 / m_classes;
974  const double qq = (1.0 - ood) * q;
975  double gain = 0.0;
976 
977  // SMO loop
978  size_t iter, maxiter = MAXITER_MULTIPLIER * m_classes;
979  for (iter=0; iter<maxiter; iter++)
980  {
981  // select working set
982  std::size_t idx = 0;
983  double kkt = 0.0;
984  for (std::size_t c=0; c<m_classes; c++)
985  {
986  const double g = gradient(c);
987  const double a = alpha(c);
988  if (g > kkt && a < C) { kkt = g; idx = c; }
989  else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
990  }
991 
992  // check stopping criterion
993  if (kkt < epsilon) break;
994 
995  // perform step
996  const double a = alpha(idx);
997  const double g = gradient(idx);
998  double m = g / qq;
999  double a_new = a + m;
1000  if (a_new <= 0.0)
1001  {
1002  m = -a;
1003  a_new = 0.0;
1004  }
1005  else if (a_new >= C)
1006  {
1007  m = C - a;
1008  a_new = C;
1009  }
1010  alpha(idx) = a_new;
1011  mu(idx) += m;
1012 
1013  // update gradient and total gain
1014  const double dg = m * q;
1015  const double dgc = dg / m_classes;
1016  if (idx == y)
1017  {
1018  for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
1019  gradient(idx) -= dg - 2.0 * dgc;
1020  }
1021  else
1022  {
1023  for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
1024  gradient(idx) -= dg;
1025  }
1026 
1027  gain += m * (g - 0.5 * (dg - dgc));
1028  }
1029 
1030  return gain;
1031  }
1032 
 1033 protected:
 1034  using QpMcLinear<InputT>::add_scaled;
 1035  using QpMcLinear<InputT>::m_data;
 1036  using QpMcLinear<InputT>::m_classes;
 1037 };
1038 
1039 
 1040 /// \brief Solver for the multi-class maximum margin regression SVM.
1041 template <class InputT>
1042 class QpMcLinearMMR : public QpMcLinear<InputT>
1043 {
1044 public:
 1045  typedef typename QpMcLinear<InputT>::DatasetType DatasetType;
 1046 
 1047  /// \brief Constructor
 1048  QpMcLinearMMR(
 1049  const DatasetType& dataset,
1050  std::size_t dim,
1051  std::size_t classes)
1052  : QpMcLinear<InputT>(dataset, dim, classes)
1053  { }
1054 
1055 protected:
1056  /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
1057  virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y)
1058  {
1059  for (std::size_t c=0; c<m_classes; c++) gradient(c) = 0.0;
1060  const double g = 1.0 - wx(y);
1061  gradient(y) = g;
1062  const double a = alpha(0);
1063  if (g > 0.0)
1064  {
1065  if (a == C) return 0.0;
1066  else return g;
1067  }
1068  else
1069  {
1070  if (a == 0.0) return 0.0;
1071  else return -g;
1072  }
1073  }
1074 
1075  /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
1076  virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
1077  {
1078  unsigned int y = m_data[index].label;
1079  double s = mu(0);
1080  double sc = -s / m_classes;
1081  double sy = s + sc;
1082  RealVector step(m_classes);
1083  for (size_t c=0; c<m_classes; c++) step(c) = (c == y) ? sy : sc;
1084  add_scaled(w, step, m_data[index].input);
1085  }
1086 
1087  /// \brief Solve the sub-problem posed by a single training example.
1088  virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu)
1089  {
1090  const double ood = 1.0 / m_classes;
1091  const double qq = (1.0 - ood) * q;
1092 
1093  double kkt = 0.0;
1094  const double g = gradient(y);
1095  const double a = alpha(0);
1096  if (g > kkt && a < C) kkt = g;
1097  else if (-g > kkt && a > 0.0) kkt = -g;
1098 
1099  // check stopping criterion
1100  if (kkt < epsilon) return 0.0;
1101 
1102  // perform step
1103  double m = g / qq;
1104  double a_new = a + m;
1105  if (a_new <= 0.0)
1106  {
1107  m = -a;
1108  a_new = 0.0;
1109  }
1110  else if (a_new >= C)
1111  {
1112  m = C - a;
1113  a_new = C;
1114  }
1115  alpha(0) = a_new;
1116  mu(0) = m;
1117 
1118  // return the gain
1119  return m * (g - 0.5 * m * qq);
1120  }
1121 
 1122 protected:
 1123  using QpMcLinear<InputT>::add_scaled;
 1124  using QpMcLinear<InputT>::m_data;
 1125  using QpMcLinear<InputT>::m_classes;
 1126 };
1127 
1128 
1129 /// \brief Solver for the multi-class SVM by Crammer & Singer.
1130 template <class InputT>
1131 class QpMcLinearCS : public QpMcLinear<InputT>
1132 {
1133 public:
 1134  typedef typename QpMcLinear<InputT>::DatasetType DatasetType;
 1135 
 1136  /// \brief Constructor
 1137  QpMcLinearCS(
 1138  const DatasetType& dataset,
1139  std::size_t dim,
1140  std::size_t classes)
1141  : QpMcLinear<InputT>(dataset, dim, classes)
1142  { }
1143 
1144 protected:
1145  /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
1146  virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y)
1147  {
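         // alpha(m_classes) stores the sum of this example's variables. In the Crammer-Singer dual the
         // variables of one example share a single upper bound, so the sum-to-C constraint is active
         // exactly when alpha(m_classes) == C, which selects the else-branch below.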
1148  if (alpha(m_classes) < C)
1149  {
1150  double violation = 0.0;
1151  for (std::size_t c=0; c<wx.size(); c++)
1152  {
1153  if (c == y)
1154  {
1155  gradient(c) = 0.0;
1156  }
1157  else
1158  {
1159  const double g = 1.0 - 0.5 * (wx(y) - wx(c));
1160  gradient(c) = g;
1161  if (g > violation) violation = g;
1162  else if (-g > violation && alpha(c) > 0.0) violation = -g;
1163  }
1164  }
1165  return violation;
1166  }
1167  else
1168  {
1169  // double kkt_up = -1e100, kkt_down = 1e100;
1170  double kkt_up = 0.0, kkt_down = 1e100;
1171  for (std::size_t c=0; c<m_classes; c++)
1172  {
1173  if (c == y)
1174  {
1175  gradient(c) = 0.0;
1176  }
1177  else
1178  {
1179  const double g = 1.0 - 0.5 * (wx(y) - wx(c));
1180  gradient(c) = g;
1181  if (g > kkt_up && alpha(c) < C) kkt_up = g;
1182  if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
1183  }
1184  }
1185  return std::max(0.0, kkt_up - kkt_down);
1186  }
1187  }
1188 
1189  /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
1190  virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
1191  {
1192  unsigned int y = m_data[index].label;
1193  double sum_mu = 0.0;
1194  for (std::size_t c=0; c<m_classes; c++) if (c != y) sum_mu += mu(c);
1195  RealVector step(-0.5 * mu);
1196  step(y) = 0.5 * sum_mu;
1197  add_scaled(w, step, m_data[index].input);
1198  }
1199 
1200  /// \brief Solve the sub-problem posed by a single training example.
1201  virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu)
1202  {
1203  const double qq = 0.5 * q;
1204  double gain = 0.0;
1205 
1206  // SMO loop
1207  size_t iter, maxiter = MAXITER_MULTIPLIER * m_classes;
1208  for (iter=0; iter<maxiter; iter++)
1209  {
1210  // select working set
1211  std::size_t idx = 0;
1212  std::size_t idx_up = 0, idx_down = 0;
1213  bool size2 = false;
1214  double kkt = 0.0;
1215  double grad = 0.0;
1216  if (alpha(m_classes) == C)
1217  {
1218  double kkt_up = -1e100, kkt_down = 1e100;
1219  for (std::size_t c=0; c<m_classes; c++)
1220  {
1221  if (c == y) continue;
1222 
1223  const double g = gradient(c);
1224  const double a = alpha(c);
1225  if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
1226  if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
1227  }
1228 
1229  if (kkt_up <= 0.0)
1230  {
1231  idx = idx_down;
1232  grad = kkt_down;
1233  kkt = -kkt_down;
1234  }
1235  else
1236  {
1237  grad = kkt_up - kkt_down;
1238  kkt = grad;
1239  size2 = true;
1240  }
1241  }
1242  else
1243  {
1244  for (std::size_t c=0; c<m_classes; c++)
1245  {
1246  if (c == y) continue;
1247 
1248  const double g = gradient(c);
1249  const double a = alpha(c);
1250  if (g > kkt) { kkt = g; idx = c; }
1251  else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
1252  }
1253  grad = gradient(idx);
1254  }
1255 
1256  // check stopping criterion
1257  if (kkt < epsilon) return gain;
1258 
1259  if (size2)
1260  {
1261  // perform step
1262  const double a_up = alpha(idx_up);
1263  const double a_down = alpha(idx_down);
1264  double m = grad / qq;
1265  double a_up_new = a_up + m;
1266  double a_down_new = a_down - m;
1267  if (a_down_new <= 0.0)
1268  {
1269  m = a_down;
1270  a_up_new = a_up + m;
1271  a_down_new = 0.0;
1272  }
1273  alpha(idx_up) = a_up_new;
1274  alpha(idx_down) = a_down_new;
1275  mu(idx_up) += m;
1276  mu(idx_down) -= m;
1277 
1278  // update gradient and total gain
1279  const double dg = 0.5 * m * qq;
1280  gradient(idx_up) -= dg;
1281  gradient(idx_down) += dg;
1282  gain += m * (grad - 2.0 * dg);
1283  }
1284  else
1285  {
1286  // perform step
1287  const double a = alpha(idx);
1288  const double a_sum = alpha(m_classes);
1289  double m = grad / qq;
1290  double a_new = a + m;
1291  double a_sum_new = a_sum + m;
1292  if (a_new <= 0.0)
1293  {
1294  m = -a;
1295  a_new = 0.0;
1296  a_sum_new = a_sum + m;
1297  }
1298  else if (a_sum_new >= C)
1299  {
1300  m = C - a_sum;
1301  a_sum_new = C;
1302  a_new = a + m;
1303  }
1304  alpha(idx) = a_new;
1305  alpha(m_classes) = a_sum_new;
1306  mu(idx) += m;
1307 
1308  // update gradient and total gain
1309  const double dg = 0.5 * m * qq;
1310  for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dg;
1311  gradient(idx) -= dg;
1312  gain += m * (grad - dg);
1313  }
1314  }
1315 
1316  return gain;
1317  }
1318 
 1319 protected:
 1320  using QpMcLinear<InputT>::add_scaled;
 1321  using QpMcLinear<InputT>::m_data;
 1322  using QpMcLinear<InputT>::m_classes;
 1323 };
1324 
1325 
1326 /// \brief Solver for the multi-class SVM with absolute margin and discriminative maximum loss.
1327 template <class InputT>
1328 class QpMcLinearADM : public QpMcLinear<InputT>
1329 {
1330 public:
 1331  typedef typename QpMcLinear<InputT>::DatasetType DatasetType;
 1332 
 1333  /// \brief Constructor
 1334  QpMcLinearADM(
 1335  const DatasetType& dataset,
1336  std::size_t dim,
1337  std::size_t classes)
1338  : QpMcLinear<InputT>(dataset, dim, classes)
1339  { }
1340 
1341 protected:
1342  /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
1343  virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y)
1344  {
1345  if (alpha(m_classes) < C)
1346  {
1347  double violation = 0.0;
1348  for (std::size_t c=0; c<m_classes; c++)
1349  {
1350  if (c == y)
1351  {
1352  gradient(c) = 0.0;
1353  }
1354  else
1355  {
1356  const double g = 1.0 + wx(c);
1357  gradient(c) = g;
1358  if (g > violation) violation = g;
1359  else if (-g > violation && alpha(c) > 0.0) violation = -g;
1360  }
1361  }
1362  return violation;
1363  }
1364  else
1365  {
1366  double kkt_up = 0.0, kkt_down = 1e100;
1367  for (std::size_t c=0; c<m_classes; c++)
1368  {
1369  if (c == y)
1370  {
1371  gradient(c) = 0.0;
1372  }
1373  else
1374  {
1375  const double g = 1.0 + wx(c);
1376  gradient(c) = g;
1377  if (g > kkt_up && alpha(c) < C) kkt_up = g;
1378  if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
1379  }
1380  }
1381  return std::max(0.0, kkt_up - kkt_down);
1382  }
1383  }
1384 
1385  /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
1386  virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
1387  {
1388  double mean_mu = 0.0;
1389  for (std::size_t c=0; c<m_classes; c++) mean_mu += mu(c);
1390  mean_mu /= (double)m_classes;
1391  RealVector step(m_classes);
1392  for (size_t c=0; c<m_classes; c++) step(c) = mean_mu - mu(c);
1393  add_scaled(w, step, m_data[index].input);
1394  }
1395 
1396  /// \brief Solve the sub-problem posed by a single training example.
1397  virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu)
1398  {
1399  const double ood = 1.0 / m_classes;
1400  const double qq = (1.0 - ood) * q;
1401  double gain = 0.0;
1402 
1403  // SMO loop
1404  size_t iter, maxiter = MAXITER_MULTIPLIER * m_classes;
1405  for (iter=0; iter<maxiter; iter++)
1406  {
1407  // select working set
1408  std::size_t idx = 0;
1409  std::size_t idx_up = 0, idx_down = 0;
1410  bool size2 = false;
1411  double kkt = 0.0;
1412  double grad = 0.0;
1413  if (alpha(m_classes) == C)
1414  {
1415  double kkt_up = -1e100, kkt_down = 1e100;
1416  for (std::size_t c=0; c<m_classes; c++)
1417  {
1418  if (c == y) continue;
1419 
1420  const double g = gradient(c);
1421  const double a = alpha(c);
1422  if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
1423  if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
1424  }
1425 
1426  if (kkt_up <= 0.0)
1427  {
1428  idx = idx_down;
1429  grad = kkt_down;
1430  kkt = -kkt_down;
1431  }
1432  else
1433  {
1434  grad = kkt_up - kkt_down;
1435  kkt = grad;
1436  size2 = true;
1437  }
1438  }
1439  else
1440  {
1441  for (std::size_t c=0; c<m_classes; c++)
1442  {
1443  if (c == y) continue;
1444 
1445  const double g = gradient(c);
1446  const double a = alpha(c);
1447  if (g > kkt) { kkt = g; idx = c; }
1448  else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
1449  }
1450  grad = gradient(idx);
1451  }
1452 
1453  // check stopping criterion
1454  if (kkt < epsilon) return gain;
1455 
1456  if (size2)
1457  {
1458  // perform step
1459  const double a_up = alpha(idx_up);
1460  const double a_down = alpha(idx_down);
1461  double m = grad / (2.0 * q);
1462  double a_up_new = a_up + m;
1463  double a_down_new = a_down - m;
1464  if (a_down_new <= 0.0)
1465  {
1466  m = a_down;
1467  a_up_new = a_up + m;
1468  a_down_new = 0.0;
1469  }
1470  alpha(idx_up) = a_up_new;
1471  alpha(idx_down) = a_down_new;
1472  mu(idx_up) += m;
1473  mu(idx_down) -= m;
1474 
1475  // update gradient and total gain
1476  const double dg = m * q;
1477  const double dgc = dg / m_classes;
1478  gradient(idx_up) -= dg;
1479  gradient(idx_down) += dg;
1480  gain += m * (grad - (dg - dgc));
1481  }
1482  else
1483  {
1484  // perform step
1485  const double a = alpha(idx);
1486  const double a_sum = alpha(m_classes);
1487  double m = grad / qq;
1488  double a_new = a + m;
1489  double a_sum_new = a_sum + m;
1490  if (a_new <= 0.0)
1491  {
1492  m = -a;
1493  a_new = 0.0;
1494  a_sum_new = a_sum + m;
1495  }
1496  else if (a_sum_new >= C)
1497  {
1498  m = C - a_sum;
1499  a_sum_new = C;
1500  a_new = a + m;
1501  }
1502  alpha(idx) = a_new;
1503  alpha(m_classes) = a_sum_new;
1504  mu(idx) += m;
1505 
1506  // update gradient and total gain
1507  const double dg = m * q;
1508  const double dgc = dg / m_classes;
1509  for (std::size_t c=0; c<m_classes; c++) gradient(c) += dgc;
1510  gradient(idx) -= dg;
1511  gain += m * (grad - 0.5 * (dg - dgc));
1512  }
1513  }
1514 
1515  return gain;
1516  }
1517 
 1518 protected:
 1519  using QpMcLinear<InputT>::add_scaled;
 1520  using QpMcLinear<InputT>::m_data;
 1521  using QpMcLinear<InputT>::m_classes;
 1522 };
1523 
1524 
1525 /// \brief Solver for the multi-class SVM with absolute margin and total maximum loss.
1526 template <class InputT>
1527 class QpMcLinearATM : public QpMcLinear<InputT>
1528 {
1529 public:
 1530  typedef typename QpMcLinear<InputT>::DatasetType DatasetType;
 1531 
 1532  /// \brief Constructor
 1533  QpMcLinearATM(
 1534  const DatasetType& dataset,
1535  std::size_t dim,
1536  std::size_t classes)
1537  : QpMcLinear<InputT>(dataset, dim, classes)
1538  { }
1539 
1540 protected:
1541  /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
1542  virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y)
1543  {
1544  if (alpha(m_classes) < C)
1545  {
1546  double violation = 0.0;
1547  for (std::size_t c=0; c<m_classes; c++)
1548  {
1549  const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
1550  gradient(c) = g;
1551  if (g > violation) violation = g;
1552  else if (-g > violation && alpha(c) > 0.0) violation = -g;
1553  }
1554  return violation;
1555  }
1556  else
1557  {
1558  double kkt_up = 0.0, kkt_down = 1e100;
1559  for (std::size_t c=0; c<m_classes; c++)
1560  {
1561  const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
1562  gradient(c) = g;
1563  if (g > kkt_up && alpha(c) < C) kkt_up = g;
1564  if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
1565  }
1566  return std::max(0.0, kkt_up - kkt_down);
1567  }
1568  }
1569 
1570  /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
1571  virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
1572  {
1573  unsigned int y = m_data[index].label;
1574  double mean = -2.0 * mu(y);
1575  for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
1576  mean /= (double)m_classes;
1577  RealVector step(m_classes);
1578  for (size_t c=0; c<m_classes; c++) step(c) = (c == y) ? (mu(c) + mean) : (mean - mu(c));
1579  add_scaled(w, step, m_data[index].input);
1580  }
1581 
1582  /// \brief Solve the sub-problem posed by a single training example.
1583  virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu)
1584  {
1585  const double ood = 1.0 / m_classes;
1586  const double qq = (1.0 - ood) * q;
1587  double gain = 0.0;
1588 
1589  // SMO loop
1590  size_t iter, maxiter = MAXITER_MULTIPLIER * m_classes;
1591  for (iter=0; iter<maxiter; iter++)
1592  {
1593  // select working set
1594  std::size_t idx = 0;
1595  std::size_t idx_up = 0, idx_down = 0;
1596  bool size2 = false;
1597  double kkt = 0.0;
1598  double grad = 0.0;
1599  if (alpha(m_classes) == C)
1600  {
1601  double kkt_up = -1e100, kkt_down = 1e100;
1602  for (std::size_t c=0; c<m_classes; c++)
1603  {
1604  const double g = gradient(c);
1605  const double a = alpha(c);
1606  if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
1607  if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
1608  }
1609 
1610  if (kkt_up <= 0.0)
1611  {
1612  idx = idx_down;
1613  grad = kkt_down;
1614  kkt = -kkt_down;
1615  }
1616  else
1617  {
1618  grad = kkt_up - kkt_down;
1619  kkt = grad;
1620  size2 = true;
1621  }
1622  }
1623  else
1624  {
1625  for (std::size_t c=0; c<m_classes; c++)
1626  {
1627  const double g = gradient(c);
1628  const double a = alpha(c);
1629  if (g > kkt) { kkt = g; idx = c; }
1630  else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
1631  }
1632  grad = gradient(idx);
1633  }
1634 
1635  // check stopping criterion
1636  if (kkt < epsilon) return gain;
1637 
1638  if (size2)
1639  {
1640  // perform step
1641  const double a_up = alpha(idx_up);
1642  const double a_down = alpha(idx_down);
1643  double m = grad / (2.0 * q);
1644  double a_up_new = a_up + m;
1645  double a_down_new = a_down - m;
1646  if (a_down_new <= 0.0)
1647  {
1648  m = a_down;
1649  a_up_new = a_up + m;
1650  a_down_new = 0.0;
1651  }
1652  alpha(idx_up) = a_up_new;
1653  alpha(idx_down) = a_down_new;
1654  mu(idx_up) += m;
1655  mu(idx_down) -= m;
1656 
1657  // update gradient and total gain
1658  const double dg = m * q;
1659  const double dgc = dg / m_classes;
1660  if (idx_up == y)
1661  {
1662  for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
1663  gradient(idx_up) -= dg - 2.0 * dgc;
1664  gradient(idx_down) += dg;
1665  }
1666  else if (idx_down == y)
1667  {
1668  gradient(idx_up) -= dg;
1669  gradient(idx_down) += dg - 2.0 * dgc;
1670  }
1671  else
1672  {
1673  gradient(idx_up) -= dg;
1674  gradient(idx_down) += dg;
1675  }
1676  gain += m * (grad - (dg - dgc));
1677  }
1678  else
1679  {
1680  // perform step
1681  const double a = alpha(idx);
1682  const double a_sum = alpha(m_classes);
1683  double m = grad / qq;
1684  double a_new = a + m;
1685  double a_sum_new = a_sum + m;
1686  if (a_new <= 0.0)
1687  {
1688  m = -a;
1689  a_new = 0.0;
1690  a_sum_new = a_sum + m;
1691  }
1692  else if (a_sum_new >= C)
1693  {
1694  m = C - a_sum;
1695  a_sum_new = C;
1696  a_new = a + m;
1697  }
1698  alpha(idx) = a_new;
1699  alpha(m_classes) = a_sum_new;
1700  mu(idx) += m;
1701 
1702  // update gradient and total gain
1703  const double dg = m * q;
1704  const double dgc = dg / m_classes;
1705  if (idx == y)
1706  {
1707  for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
1708  gradient(idx) -= dg - 2.0 * dgc;
1709  }
1710  else
1711  {
1712  for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
1713  gradient(idx) -= dg;
1714  }
1715  gain += m * (grad - 0.5 * (dg - dgc));
1716  }
1717  }
1718 
1719  return gain;
1720  }
1721 
 1722 protected:
 1723  using QpMcLinear<InputT>::add_scaled;
 1724  using QpMcLinear<InputT>::m_data;
 1725  using QpMcLinear<InputT>::m_classes;
 1726 };
1727 
1728 
1729 /// \brief Solver for the "reinforced" multi-class SVM.
1730 template <class InputT>
1731 class QpMcLinearReinforced : public QpMcLinear<InputT>
1732 {
1733 public:
 1734  typedef typename QpMcLinear<InputT>::DatasetType DatasetType;
 1735 
 1736  /// \brief Constructor
 1737  QpMcLinearReinforced(
 1738  const DatasetType& dataset,
1739  std::size_t dim,
1740  std::size_t classes)
1741  : QpMcLinear<InputT>(dataset, dim, classes)
1742  { }
1743 
1744 protected:
1745  /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
1746  virtual double calcGradient(RealVector& gradient, RealVector wx, RealMatrixRow const& alpha, double C, unsigned int y)
1747  {
1748  double violation = 0.0;
1749  for (std::size_t c=0; c<m_classes; c++)
1750  {
1751  const double g = (c == y) ? (m_classes - 1.0) - wx(y) : 1.0 + wx(c);
1752  gradient(c) = g;
1753  if (g > violation && alpha(c) < C) violation = g;
1754  else if (-g > violation && alpha(c) > 0.0) violation = -g;
1755  }
1756  return violation;
1757  }
1758 
1759  /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
1760  virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
1761  {
1762  unsigned int y = m_data[index].label;
1763  double mean = -2.0 * mu(y);
1764  for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
1765  mean /= (double)m_classes;
1766  RealVector step(m_classes);
1767  for (std::size_t c=0; c<m_classes; c++) step(c) = ((c == y) ? (mu(c) + mean) : (mean - mu(c)));
1768  add_scaled(w, step, m_data[index].input);
1769  }
1770 
1771  /// \brief Solve the sub-problem posed by a single training example.
1772  virtual double solveSub(double epsilon, RealVector gradient, double q, double C, unsigned int y, RealMatrixRow& alpha, RealVector& mu)
1773  {
1774  const double ood = 1.0 / m_classes;
1775  const double qq = (1.0 - ood) * q;
1776  double gain = 0.0;
1777 
1778  // SMO loop
1779  size_t iter, maxiter = MAXITER_MULTIPLIER * m_classes;
1780  for (iter=0; iter<maxiter; iter++)
1781  {
1782  // select working set
1783  std::size_t idx = 0;
1784  double kkt = 0.0;
1785  for (std::size_t c=0; c<m_classes; c++)
1786  {
1787  const double g = gradient(c);
1788  const double a = alpha(c);
1789  if (g > kkt && a < C) { kkt = g; idx = c; }
1790  else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
1791  }
1792 
1793  // check stopping criterion
1794  if (kkt < epsilon) break;
1795 
1796  // perform step
1797  const double a = alpha(idx);
1798  const double g = gradient(idx);
1799  double m = g / qq;
1800  double a_new = a + m;
1801  if (a_new <= 0.0)
1802  {
1803  m = -a;
1804  a_new = 0.0;
1805  }
1806  else if (a_new >= C)
1807  {
1808  m = C - a;
1809  a_new = C;
1810  }
1811  alpha(idx) = a_new;
1812  mu(idx) += m;
1813 
1814  // update gradient and total gain
1815  const double dg = m * q;
1816  const double dgc = dg / m_classes;
1817  if (idx == y)
1818  {
1819  for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
1820  gradient(idx) -= dg - 2.0 * dgc;
1821  }
1822  else
1823  {
1824  for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
1825  gradient(idx) -= dg;
1826  }
1827 
1828  gain += m * (g - 0.5 * (dg - dgc));
1829  }
1830 
1831  return gain;
1832  }
1833 
 1834 protected:
 1835  using QpMcLinear<InputT>::add_scaled;
 1836  using QpMcLinear<InputT>::m_data;
 1837  using QpMcLinear<InputT>::m_classes;
 1838 };
1839 
1840 
1841 }
1842 #endif