//===========================================================================
/*!
 *
 * \brief       Quadratic programming solver for linear SVM training without bias
 *
 * \author      T. Glasmachers
 * \date        -
 *
 *
 * \par Copyright 1995-2016 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_QP_QPBOXLINEAR_H
#define SHARK_ALGORITHMS_QP_QPBOXLINEAR_H

#include <shark/Core/Timer.h>
#include <shark/Algorithms/QP/QuadraticProgram.h>
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/LinAlg/Base.h>
#include <cmath>
#include <iostream>


namespace shark {


// strategy constants
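// (CHANGE_RATE sets how fast the per-variable preferences adapt to the
// observed gains; PREF_MIN and PREF_MAX clip each preference to a sane range)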
#define CHANGE_RATE 0.2
#define PREF_MIN 0.05
#define PREF_MAX 20.0


///
/// \brief Quadratic program solver for box-constrained problems with linear kernel
///
/// \par
/// The QpBoxLinear class is a decomposition-based solver for linear
/// support vector machine classifiers trained with the hinge loss.
/// Its optimization is largely based on the paper<br>
/// "A Dual Coordinate Descent Method for Large-scale Linear SVM"
/// by Hsieh, Chang, and Lin, ICML, 2008.
/// However, the present algorithm differs considerably, since it
/// learns variable preferences as a replacement for the missing
/// working set selection. This mechanism also takes over the role
/// of the shrinking heuristic.
///
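/// \par Example
/// A minimal usage sketch (variable names are illustrative): train a
/// hinge loss SVM (reg = 0) with complexity parameter C on a binary
/// classification dataset of type LabeledData<RealVector, unsigned int>.
/// \code
/// QpBoxLinear<RealVector> solver(data, inputDimension(data));
/// QpStoppingCondition stop;
/// stop.minAccuracy = 0.001;
/// QpSolutionProperties props;
/// RealVector w = solver.solve(C, 0.0, stop, &props);
/// \endcode
///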
template <class InputT>
class QpBoxLinear
{
public:
	typedef LabeledData<InputT, unsigned int> DatasetType;
	typedef typename DatasetType::element_type ElementType;

	///
	/// \brief Constructor
	///
	/// \param  dataset  training data
	/// \param  dim      problem dimension
	///
	QpBoxLinear(const DatasetType& dataset, std::size_t dim)
	: m_data(dataset)
	, m_xSquared(dataset.numberOfElements())
	, m_dim(dim)
	{
		SHARK_ASSERT(dim > 0);

		// pre-compute squared norms
		for (std::size_t i=0; i<m_data.size(); i++)
		{
			ElementType x_i = m_data[i];
			m_xSquared(i) = inner_prod(x_i.input, x_i.input);
		}
	}

	///
	/// \brief Solve the SVM training problem.
	///
	/// \param  bound    upper bound for the alpha components; this is the complexity parameter C of the hinge loss SVM
	/// \param  reg      coefficient of the penalty term \f$ -\frac{reg}{2} \cdot \|\alpha\|^2 \f$; reg = 1/C, where C is the complexity parameter of the squared hinge loss SVM
	/// \param  stop     stopping condition(s)
	/// \param  prop     solution properties
	/// \param  verbose  if true, the solver prints status information and solution statistics
	///
	RealVector solve(
			double bound,
			double reg,
			QpStoppingCondition& stop,
			QpSolutionProperties* prop = NULL,
			bool verbose = false)
	{
		// sanity checks
		SHARK_ASSERT(bound > 0.0);
		SHARK_ASSERT(reg >= 0.0);

		// measure training time
		Timer timer;
		timer.start();

		// prepare dimensions and vectors
		std::size_t ell = m_data.size();
		RealVector alpha(ell, 0.0);
		RealVector w(m_dim, 0.0);
		RealVector pref(ell, 1.0);      // measure of success of individual steps
		double prefsum = ell;           // normalization constant
		std::vector<std::size_t> schedule(ell);

		// prepare counters
		std::size_t epoch = 0;
		std::size_t steps = 0;

		// prepare performance monitoring for self-adaptation
		double max_violation = 0.0;
		const double gain_learning_rate = 1.0 / ell;
		double average_gain = 0.0;
		bool canstop = true;

		// outer optimization loop
		while (true)
		{
			// define schedule
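			// (each variable i receives a number of slots proportional to
			// its preference pref[i]; stochastic rounding keeps the expected
			// schedule length at ell, and the slots are shuffled afterwards)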
			double psum = prefsum;
			prefsum = 0.0;
			std::size_t pos = 0;
			for (std::size_t i=0; i<ell; i++)
			{
				double p = pref[i];
				double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
				std::size_t n = (std::size_t)std::floor(num);
				double prob = num - n;
				if (Rng::uni() < prob) n++;
				for (std::size_t j=0; j<n; j++)
				{
					schedule[pos] = i;
					pos++;
				}
				psum -= p;
				prefsum += p;
			}
			SHARK_ASSERT(pos == ell);
			for (std::size_t i=0; i<ell; i++) std::swap(schedule[i], schedule[Rng::discrete(0, ell - 1)]);

			// inner loop
			max_violation = 0.0;
			for (std::size_t j=0; j<ell; j++)
			{
				// active variable
				std::size_t i = schedule[j];
				ElementType e_i = m_data[i];
				double y_i = (e_i.label > 0) ? +1.0 : -1.0;

				// compute gradient and projected gradient
				double a = alpha(i);
				double wyx = y_i * inner_prod(w, e_i.input);
				double g = 1.0 - wyx - reg * a;
				double pg = (a == 0.0 && g < 0.0) ? 0.0 : (a == bound && g > 0.0 ? 0.0 : g);

				// update maximal KKT violation over the epoch
				max_violation = std::max(max_violation, std::abs(pg));
				double gain = 0.0;

				// perform the step
				if (pg != 0.0)
				{
					// SMO-style coordinate descent step
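					// (the unconstrained minimizer of the one-dimensional
					// sub-problem lies at a + g/q, where q = x_i^T x_i + reg
					// is the curvature; the step mu is clipped below so that
					// the new alpha stays inside the box [0, bound])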
					double q = m_xSquared(i) + reg;
					double mu = g / q;
					double new_a = a + mu;

					// numerically stable update
					if (new_a <= 0.0)
					{
						mu = -a;
						new_a = 0.0;
					}
					else if (new_a >= bound)
					{
						mu = bound - a;
						new_a = bound;
					}

					// update both representations of the weight vector: alpha and w
					alpha(i) = new_a;
					w += (mu * y_i) * e_i.input;
					gain = mu * (g - 0.5 * q * mu);

					steps++;
				}

				// update gain-based preferences
				{
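					// (steps whose gain exceeds the running average raise the
					// preference of the variable, others lower it, clipped to
					// [PREF_MIN, PREF_MAX]; the first epoch only estimates the
					// average gain)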
					if (epoch == 0) average_gain += gain / (double)ell;
					else
					{
						double change = CHANGE_RATE * (gain / average_gain - 1.0);
						double newpref = std::min(PREF_MAX, std::max(PREF_MIN, pref(i) * std::exp(change)));
						prefsum += newpref - pref(i);
						pref[i] = newpref;
						average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
					}
				}
			}

			epoch++;

			// stopping criteria
			if (stop.maxIterations > 0 && ell * epoch >= stop.maxIterations)
			{
				if (prop != NULL) prop->type = QpMaxIterationsReached;
				break;
			}

			if (timer.stop() >= stop.maxSeconds)
			{
				if (prop != NULL) prop->type = QpTimeout;
				break;
			}

			if (max_violation < stop.minAccuracy)
			{
				if (verbose) std::cout << "#" << std::flush;
				if (canstop)
				{
					if (prop != NULL) prop->type = QpAccuracyReached;
					break;
				}
				else
				{
					// prepare a full sweep for a reliable check of the stopping criterion
					canstop = true;
					for (std::size_t i=0; i<ell; i++) pref[i] = 1.0;
					prefsum = ell;
				}
			}
			else
			{
				if (verbose) std::cout << "." << std::flush;
				canstop = false;
			}
		}

		timer.stop();

		// compute solution statistics
		std::size_t free_SV = 0;
		std::size_t bounded_SV = 0;
		double objective = -0.5 * shark::blas::inner_prod(w, w);
		for (std::size_t i=0; i<ell; i++)
		{
			double a = alpha(i);
			if (a > 0.0)
			{
				objective += a;
				objective -= reg/2.0 * a * a;
				if (a == bound) bounded_SV++;
				else free_SV++;
			}
		}

		// return solution statistics
		if (prop != NULL)
		{
			prop->accuracy = max_violation;   // this is approximate, but a good guess
			prop->iterations = ell * epoch;
			prop->value = objective;
			prop->seconds = timer.lastLap();
		}

		// output solution statistics
		if (verbose)
		{
			std::cout << std::endl;
			std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
			std::cout << "number of epochs: " << epoch << std::endl;
			std::cout << "number of iterations: " << (ell * epoch) << std::endl;
			std::cout << "number of non-zero steps: " << steps << std::endl;
			std::cout << "dual accuracy: " << max_violation << std::endl;
			std::cout << "dual objective value: " << objective << std::endl;
			std::cout << "number of free support vectors: " << free_SV << std::endl;
			std::cout << "number of bounded support vectors: " << bounded_SV << std::endl;
		}

		// return the solution
		return w;
	}

protected:
	DataView<const DatasetType> m_data;   ///< view on training data
	RealVector m_xSquared;                ///< diagonal entries of the quadratic matrix
	std::size_t m_dim;                    ///< input space dimension
};


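///
/// \brief Specialization of QpBoxLinear for sparse (compressed) input vectors.
///
/// \par
/// The solver logic is identical to the general template above; only the
/// data handling differs. The training examples are copied into a compact,
/// sentinel-terminated array of (index, value) pairs, which is much faster
/// to iterate over than ublas' compressed vectors.
///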
template < >
class QpBoxLinear<CompressedRealVector>
{
public:
	typedef LabeledData<CompressedRealVector, unsigned int> DatasetType;

	///
	/// \brief Constructor
	///
	/// \param  dataset  training data
	/// \param  dim      problem dimension
	///
	QpBoxLinear(const DatasetType& dataset, std::size_t dim)
	: x(dataset.numberOfElements())
	, y(dataset.numberOfElements())
	, diagonal(dataset.numberOfElements())
	, m_dim(dim)
	{
		SHARK_ASSERT(dim > 0);

		// transform ublas sparse vectors into a fast format
		// (yes, ublas is slow...), and compute the diagonal
		// elements of the quadratic matrix
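		// (each example becomes a contiguous run of (index, value) pairs in
		// "storage", terminated by a sentinel entry with index (std::size_t)-1;
		// the pointers in "x" are set in a second pass, once "storage" has
		// reached its final size and no longer reallocates)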
		SparseVector sparse;
		for (std::size_t b=0, j=0; b<dataset.numberOfBatches(); b++)
		{
			DatasetType::const_batch_reference batch = dataset.batch(b);
			for (std::size_t i=0; i<batch.size(); i++)
			{
				CompressedRealVector x_i = shark::get(batch, i).input;
				// if (x_i.nnz() == 0) continue;

				unsigned int y_i = shark::get(batch, i).label;
				y[j] = 2.0 * y_i - 1.0;
				double d = 0.0;
				for (CompressedRealVector::const_iterator it=x_i.begin(); it != x_i.end(); ++it)
				{
					double v = *it;
					sparse.index = it.index();
					sparse.value = v;
					storage.push_back(sparse);
					d += v * v;
				}
				sparse.index = (std::size_t)-1;
				storage.push_back(sparse);
				diagonal(j) = d;
				j++;
			}
		}
		for (std::size_t b=0, j=0, k=0; b<dataset.numberOfBatches(); b++)
		{
			DatasetType::const_batch_reference batch = dataset.batch(b);
			for (std::size_t i=0; i<batch.size(); i++)
			{
				CompressedRealVector x_i = shark::get(batch, i).input;
				// if (x_i.nnz() == 0) continue;

				x[j] = &storage[k];   // cannot be done in the first loop because of vector reallocation
				for (; storage[k].index != (std::size_t)-1; k++);
				k++;
				j++;
			}
		}
	}

	///
	/// \brief Solve the SVM training problem.
	///
	/// \param  bound    upper bound for the alpha components; this is the complexity parameter C of the hinge loss SVM
	/// \param  reg      coefficient of the penalty term \f$ -\frac{reg}{2} \cdot \|\alpha\|^2 \f$; reg = 1/C, where C is the complexity parameter of the squared hinge loss SVM
	/// \param  stop     stopping condition(s)
	/// \param  prop     solution properties
	/// \param  verbose  if true, the solver prints status information and solution statistics
	///
	RealVector solve(
			double bound,
			double reg,
			QpStoppingCondition& stop,
			QpSolutionProperties* prop = NULL,
			bool verbose = false)
	{
		// sanity checks
		SHARK_ASSERT(bound > 0.0);
		SHARK_ASSERT(reg >= 0.0);

		// measure training time
		Timer timer;
		timer.start();

		// prepare dimensions and vectors
		std::size_t ell = x.size();
		RealVector alpha(ell, 0.0);
		RealVector w(m_dim, 0.0);
		RealVector pref(ell, 1.0);      // measure of success of individual steps
		double prefsum = ell;           // normalization constant
		std::vector<std::size_t> schedule(ell);

		// prepare counters
		std::size_t epoch = 0;
		std::size_t steps = 0;

		// prepare performance monitoring for self-adaptation
		double max_violation = 0.0;
		const double gain_learning_rate = 1.0 / ell;
		double average_gain = 0.0;
		bool canstop = true;

		// outer optimization loop
		while (true)
		{
			// define schedule
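			// (preference-proportional schedule with stochastic rounding,
			// exactly as in the general template above)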
			double psum = prefsum;
			prefsum = 0.0;
			std::size_t pos = 0;
			for (std::size_t i=0; i<ell; i++)
			{
				double p = pref[i];
				double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
				std::size_t n = (std::size_t)std::floor(num);
				double prob = num - n;
				if (Rng::uni() < prob) n++;
				for (std::size_t j=0; j<n; j++)
				{
					schedule[pos] = i;
					pos++;
				}
				psum -= p;
				prefsum += p;
			}
			SHARK_ASSERT(pos == ell);
			for (std::size_t i=0; i<ell; i++) std::swap(schedule[i], schedule[Rng::discrete(0, ell - 1)]);

			// inner loop
			max_violation = 0.0;
			for (std::size_t j=0; j<ell; j++)
			{
				// active variable
				std::size_t i = schedule[j];
				const SparseVector* x_i = x[i];

				// compute gradient and projected gradient
				double a = alpha(i);
				double wyx = y(i) * inner_prod(w, x_i);
				double g = 1.0 - wyx - reg * a;
				double pg = (a == 0.0 && g < 0.0) ? 0.0 : (a == bound && g > 0.0 ? 0.0 : g);

				// update maximal KKT violation over the epoch
				max_violation = std::max(max_violation, std::abs(pg));
				double gain = 0.0;

				// perform the step
				if (pg != 0.0)
				{
					// SMO-style coordinate descent step
					double q = diagonal(i) + reg;
					double mu = g / q;
					double new_a = a + mu;

					// numerically stable update
					if (new_a <= 0.0)
					{
						mu = -a;
						new_a = 0.0;
					}
					else if (new_a >= bound)
					{
						mu = bound - a;
						new_a = bound;
					}

					// update both representations of the weight vector: alpha and w
					alpha(i) = new_a;
					// w += (mu * y(i)) * x_i;
					axpy(w, mu * y(i), x_i);
					gain = mu * (g - 0.5 * q * mu);

					steps++;
				}

				// update gain-based preferences
				{
					if (epoch == 0) average_gain += gain / (double)ell;
					else
					{
						double change = CHANGE_RATE * (gain / average_gain - 1.0);
						double newpref = std::min(PREF_MAX, std::max(PREF_MIN, pref(i) * std::exp(change)));
						prefsum += newpref - pref(i);
						pref[i] = newpref;
						average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
					}
				}
			}

			epoch++;

			// stopping criteria
			if (stop.maxIterations > 0 && ell * epoch >= stop.maxIterations)
			{
				if (prop != NULL) prop->type = QpMaxIterationsReached;
				break;
			}

			if (timer.stop() >= stop.maxSeconds)
			{
				if (prop != NULL) prop->type = QpTimeout;
				break;
			}

			if (max_violation < stop.minAccuracy)
			{
				if (verbose) std::cout << "#" << std::flush;
				if (canstop)
				{
					if (prop != NULL) prop->type = QpAccuracyReached;
					break;
				}
				else
				{
					// prepare a full sweep for a reliable check of the stopping criterion
					canstop = true;
					for (std::size_t i=0; i<ell; i++) pref[i] = 1.0;
					prefsum = ell;
				}
			}
			else
			{
				if (verbose) std::cout << "." << std::flush;
				canstop = false;
			}
		}

		timer.stop();

		// compute solution statistics
		std::size_t free_SV = 0;
		std::size_t bounded_SV = 0;
		double objective = -0.5 * shark::blas::inner_prod(w, w);
		for (std::size_t i=0; i<ell; i++)
		{
			double a = alpha(i);
			if (a > 0.0)
			{
				objective += a;
				objective -= reg/2.0 * a * a;
				if (a == bound) bounded_SV++;
				else free_SV++;
			}
		}

		// return solution statistics
		if (prop != NULL)
		{
			prop->accuracy = max_violation;   // this is approximate, but a good guess
			prop->iterations = ell * epoch;
			prop->value = objective;
			prop->seconds = timer.lastLap();
		}

		// output solution statistics
		if (verbose)
		{
			std::cout << std::endl;
			std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
			std::cout << "number of epochs: " << epoch << std::endl;
			std::cout << "number of iterations: " << (ell * epoch) << std::endl;
			std::cout << "number of non-zero steps: " << steps << std::endl;
			std::cout << "dual accuracy: " << max_violation << std::endl;
			std::cout << "dual objective value: " << objective << std::endl;
			std::cout << "number of free support vectors: " << free_SV << std::endl;
			std::cout << "number of bounded support vectors: " << bounded_SV << std::endl;
		}

		// return the solution
		return w;
	}

protected:
	/// \brief Data structure for sparse vectors.
	struct SparseVector
	{
		std::size_t index;
		double value;
	};

	/// \brief Famous BLAS "axpy" operation, here adding a multiple of a sparse vector to a dense one.
	static inline void axpy(RealVector& w, double alpha, const SparseVector* xi)
	{
		// walk the sentinel-terminated run of (index, value) pairs
		while (true)
		{
			if (xi->index == (std::size_t)-1) return;
			w[xi->index] += alpha * xi->value;
			xi++;
		}
	}

	/// \brief Inner product between a dense and a sparse vector.
	static inline double inner_prod(RealVector const& w, const SparseVector* xi)
	{
		double ret = 0.0;
		while (true)
		{
			if (xi->index == (std::size_t)-1) return ret;
			ret += w[xi->index] * xi->value;
			xi++;
		}
	}

	std::vector<SparseVector> storage;   ///< storage for sparse vectors
	std::vector<SparseVector*> x;        ///< sparse vectors
	RealVector y;                        ///< +1/-1 labels
	RealVector diagonal;                 ///< diagonal entries of the quadratic matrix
	std::size_t m_dim;                   ///< input space dimension
};


}
#endif