QpMcDecomp.h
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Quadratic programming solver for multi-class SVMs
6  *
7  *
8  *
9  *
10  * \author T. Glasmachers
11  * \date 2007-2012
12  *
13  *
14  * \par Copyright 1995-2015 Shark Development Team
15  *
16  * <BR><HR>
17  * This file is part of Shark.
18  * <http://image.diku.dk/shark/>
19  *
20  * Shark is free software: you can redistribute it and/or modify
21  * it under the terms of the GNU Lesser General Public License as published
22  * by the Free Software Foundation, either version 3 of the License, or
23  * (at your option) any later version.
24  *
25  * Shark is distributed in the hope that it will be useful,
26  * but WITHOUT ANY WARRANTY; without even the implied warranty of
27  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28  * GNU Lesser General Public License for more details.
29  *
30  * You should have received a copy of the GNU Lesser General Public License
31  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
32  *
33  */
34 //===========================================================================
35 
36 
37 #ifndef SHARK_ALGORITHMS_QP_QPMCDECOMP_H
38 #define SHARK_ALGORITHMS_QP_QPMCDECOMP_H
39 
40 #include <shark/Algorithms/QP/QuadraticProgram.h>
41 #include <shark/Algorithms/QP/QpSparseArray.h>
42 #include <shark/Core/Timer.h>
43 #include <shark/Data/Dataset.h>
44 
45 
46 namespace shark {
47 
48 
49 #define ITERATIONS_BETWEEN_SHRINKING 1000
50 
51 //todo: O.K.: inline functions?!?
52 #define GRADIENT_UPDATE(r, from, to, mu, q) \
53 { \
54  std::size_t a, b, p; \
55  for (a=(from); a<(to); a++) \
56  { \
57  double k = (q)[a]; \
58  tExample& ex = example[a]; \
59  typename QpSparseArray<QpFloatType>::Row const& row = M.row(classes * (r) + ex.y); \
60  QpFloatType def = row.defaultvalue; \
61  if (def == 0.0) \
62  { \
63  for (b=0; b<row.size; b++) \
64  { \
65  p = row.entry[b].index; \
66  gradient(ex.var[p]) -= (mu) * row.entry[b].value * k; \
67  } \
68  } \
69  else \
70  { \
71  for (b=0; b<row.size; b++) \
72  { \
73  p = row.entry[b].index; \
74  gradient(ex.var[p]) -= (mu) * (row.entry[b].value - def) * k; \
75  } \
76  double upd = (mu) * def * k; \
77  for (b=0; b<ex.active; b++) gradient(ex.avar[b]) -= upd; \
78  } \
79  } \
80 }
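
// For readability, the following is an equivalent inline-member-function
// sketch of the GRADIENT_UPDATE macro above (one way the TODO note might be
// resolved; the name gradientUpdate is an assumption, and the solver below
// still uses the macro). It would have to be a member of QpMcDecomp, since
// it accesses the example, gradient, M, and classes members:
//
// void gradientUpdate(std::size_t r, std::size_t from, std::size_t to,
//                     double mu, QpFloatType const* q)
// {
//     for (std::size_t a = from; a != to; a++)
//     {
//         double k = q[a];                   // kernel value of example a
//         tExample& ex = example[a];
//         typename QpSparseArray<QpFloatType>::Row const& row
//             = M.row(classes * r + ex.y);
//         QpFloatType def = row.defaultvalue;
//         if (def == 0.0)
//         {
//             // sparse row: only explicitly stored entries contribute
//             for (std::size_t b = 0; b != row.size; b++)
//                 gradient(ex.var[row.entry[b].index]) -= mu * row.entry[b].value * k;
//         }
//         else
//         {
//             // dense default: update stored entries relative to the default,
//             // then apply the default update to all active variables
//             for (std::size_t b = 0; b != row.size; b++)
//                 gradient(ex.var[row.entry[b].index]) -= mu * (row.entry[b].value - def) * k;
//             double upd = mu * def * k;
//             for (std::size_t b = 0; b != ex.active; b++)
//                 gradient(ex.avar[b]) -= upd;
//         }
//     }
// }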
81 
82 #define GAIN_SELECTION_BOX(qif) \
83  f = exa.var[pf]; \
84  if (f >= activeVar) continue; \
85  double gf = gradient(f); \
86  double af = alpha(f); \
87  if ((af > 0.0 && gf < 0.0) || (af < C && gf > 0.0)) \
88  { \
89  double df = variable[f].diagonal; \
90  double diag_q = di * df; \
91  double det_q = diag_q - qif * qif; \
92  if (det_q < 1e-12 * diag_q) \
93  { \
94  if (di == 0.0 && df == 0.0) \
95  { if (f != i) { j = f; return ret; } } \
96  else \
97  { \
98  if (di * gf - df * gi != 0.0) \
99  { if (f != i) { j = f; return ret; } } \
100  else \
101  { \
102  double g2 = gf*gf + gi*gi; \
103  double gain = (g2*g2) / (gf*gf*df + 2.0*gf*gi*qif + gi*gi*di); \
104  if (gain > bestgain) { if (f != i) { bestgain = gain; j = f; } } \
105  } \
106  } \
107  } \
108  else \
109  { \
110  double gain = (gf*gf*di - 2.0*gf*gi*qif + gi*gi*df) / det_q; \
111  if (gain > bestgain) { if (f != i) { bestgain = gain; j = f; } } \
112  } \
113  }
114 
115 #define GAIN_SELECTION_TRIANGLE(qif) \
116  f = exa.var[pf]; \
117  if (f >= activeVar) continue; \
118  double gf = gradient(f); \
119  double af = alpha(f); \
120  if ((af > 0.0 && gf < 0.0) || (varsum < C && gf > 0.0)) \
121  { \
122  double df = variable[f].diagonal; \
123  double diag_q = di * df; \
124  double det_q = diag_q - qif * qif; \
125  if (det_q < 1e-12 * diag_q) \
126  { \
127  if (di == 0.0 && df == 0.0) \
128  { if (f != i) { j = f; return ret; } } \
129  else \
130  { \
131  if (di * gf - df * gi != 0.0) \
132  { if (f != i) { j = f; return ret; } } \
133  else \
134  { \
135  double g2 = gf*gf + gi*gi; \
136  double gain = (g2*g2) / (gf*gf*df + 2.0*gf*gi*qif + gi*gi*di); \
137  if (gain > bestgain) { if (f != i) { bestgain = gain; j = f; } } \
138  } \
139  } \
140  } \
141  else \
142  { \
143  double gain = (gf*gf*di - 2.0*gf*gi*qif + gi*gi*df) / det_q; \
144  if (gain > bestgain) { if (f != i) { bestgain = gain; j = f; } } \
145  } \
146  }
147 
148 
149 //!
150 //! \brief Quadratic program solver for multi class SVM problems
151 //!
152 //! \par
153 //! This quadratic program solver solves the following primal SVM problem:<br><br>
154 //! \f$ \min_{w, b, \xi} \quad \frac{1}{2} \sum_c \|w_c\|^2 + C \cdot \sum_{i,r} \xi_{i,r} \f$ <br>
155 //! \f$ \text{s.t.} \quad \forall i, p: \quad \sum_c \nu_{y_i,p,c} \big( \langle w_c, \phi(x_i) \rangle + b_c \big) \geq \gamma_{y_i,p} - \xi_{i,\rho(p)} \f$ <br>
156 //! \f$ \text{and} \quad \forall i, r: \quad \xi_{i,r} \geq 0 \f$ <br>
157 //! \f$ \text{and} \quad \left[ \sum_c \langle w_c, \phi(\cdot) \rangle + b_c = 0 \right] \f$
158 //!
159 //! \par
160 //! The last so-called sum-to-zero constraint in square brackets is optional,
161 //! and so is the presence of the bias variables b. The index i runs over
162 //! all training examples, p runs over the constraint index set P, r runs
163 //! over the slack variable index set R, and c runs over the classes. The
164 //! coefficients \f$ \nu_{y,p,c} \f$ define the margin functions (absolute
165 //! or relative), \f$ \gamma_{y,p} \f$ are target margins for these functions,
166 //! \f$ \phi \f$ is a feature map for the kernel function, and
167 //! \f$ \rho : P \to R \f$ connects constraints to slack variables and
168 //! depends on the surrogate loss function. Currently, the solver supports
169 //! only the cases \f$ \rho = \operatorname{id} \f$ (sum-loss) and
170 //! \f$ |R| = 1 \f$ (max-loss).
171 //!
172 //! \par
173 //! The solver applies a mixed strategy between solving the dual w.r.t. the
174 //! variables w for fixed b, and solving the primal for b with fixed w. This
175 //! strategy allows for the application of efficient decomposition algorithms
176 //! even to seemingly involved cases, like the multi-class SVM by
177 //! Crammer & Singer with bias parameter.
178 //!
179 //! \par
180 //! The implementation of this solver is relatively efficient. It exploits
181 //! the sparsity of the quadratic term of the dual objective function for
182 //! relative margins, and it provides caching and shrinking techniques on the
183 //! level of training examples (and the kernel matrix) and variables
184 //! (for gradient updates).
185 //!
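//! \par
//! A minimal usage sketch (the kernel matrix types and the construction of
//! gamma, rho, nu and M are assumptions that depend on the machine being
//! trained; only the QpMcDecomp interface is taken from this file):
//! \code
//! typedef KernelMatrix<RealVector, float> KM;
//! KM km(kernel, dataset.inputs());      // kernel Gram matrix of the data
//! CachedMatrix<KM> cache(&km);          // cached row access for the solver
//! QpMcDecomp<CachedMatrix<KM> > solver(cache, gamma, rho, nu, M, true);
//! RealVector alpha(cardP * dataset.numberOfElements(), 0.0);
//! RealVector bias(classes, 0.0);
//! QpStoppingCondition stop;
//! stop.minAccuracy = 1e-3;
//! solver.solve(dataset.labels(), C, alpha, stop, NULL, &bias);
//! \endcode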
186 template <class Matrix>
187 class QpMcDecomp
188 {
189 public:
190  //////////////////////////////////////////////////////////////////
191  // The types below define the type used for caching kernel values. The default is float,
192  // since this type offers sufficient accuracy in the vast majority of cases, at a memory
193  // cost of only four bytes. However, the type definition makes it easy to use double instead
194  // (e.g., in case high accuracy training is needed).
195  typedef typename Matrix::QpFloatType QpFloatType;
196  typedef blas::matrix<QpFloatType> QpMatrixType;
197  typedef blas::matrix_row<QpMatrixType> QpMatrixRowType;
198  typedef blas::matrix_column<QpMatrixType> QpMatrixColumnType;
199 
200  //! Constructor
201  //! \param kernel kernel matrix - cache or pre-computed matrix
202  //! \param _gamma _gamma(y, p) is the target margin of constraint p for examples of class y
203  //! \param _rho mapping from constraints to slack variables
204  //! \param _nu margin coefficients in the format \f$ \nu_{y,p,c} \f$ = _nu(|P|*y+p, c)
205  //! \param _M kernel modifiers in the format \f$ M(y_i, p, y_j, q) \f$ = _M(classes*(y_i*|P|+p)+y_j, q)
206  //! \param sumToZeroConstraint enable or disable the sum-to-zero constraint
207  QpMcDecomp(Matrix& kernel,
208  RealMatrix const& _gamma,
209  UIntVector const& _rho,
210  QpSparseArray<QpFloatType> const& _nu,
211  QpSparseArray<QpFloatType> const& _M,
212  bool sumToZeroConstraint)
213  : kernelMatrix(kernel)
214  , gamma(_gamma)
215  , rho(_rho)
216  , nu(_nu)
217  , M(_M)
218  , sumToZero(sumToZeroConstraint)
219  {
220  useShrinking = true;
221  examples = kernelMatrix.size();
222 
223  classes = gamma.size1();
224  cardP = rho.size();
225  cardR = 1;
226  unsigned int p;
227  for (p=0; p<cardP; p++) if (rho[p] >= cardR) cardR = rho[p] + 1;
228  variables = examples * cardP;
229 
230  SHARK_CHECK(
231  (gamma.size2() == cardP) &&
232  (nu.width() == classes) &&
233  (nu.height() == classes * cardP) &&
234  (M.width() == cardP) &&
235  (M.height() == classes * cardP * classes) &&
236  (cardP <= classes), // makes sense, but is this necessarily so?
237  "[QpMcDecomp::QpMcDecomp] dimension conflict"
238  );
239 
240  if (cardR != 1 && cardR != cardP)
241  {
242  // constraint of this specific implementation; for efficiency
243  throw SHARKEXCEPTION("[QpMcDecomp::QpMcDecomp] currently this solver supports only bijective or trivial rho");
244  }
245  }
246 
247 
248  //!
249  //! \brief solve the quadratic program
250  //!
251  //! \param target class labels in {0, ..., classes-1}
252  //! \param C regularization constant, upper bound on all variables
253  //! \param solutionAlpha input: initial feasible vector \f$ \alpha \f$; output: solution \f$ \alpha^* \f$
254  //! \param stop stopping condition(s)
255  //! \param prop solution properties (may be NULL)
256  //! \param solutionBias input: initial bias parameters \f$ b \f$; output: solution \f$ b^* \f$. If this parameter is NULL, then the corresponding problem without bias is solved.
257  //!
258  void solve(Data<unsigned int> const& target,
259  double C,
260  RealVector& solutionAlpha,
261  QpStoppingCondition& stop,
262  QpSolutionProperties* prop = NULL,
263  RealVector* solutionBias = NULL)
264  {
265  SIZE_CHECK(target.numberOfElements() == examples);
266  SIZE_CHECK(solutionAlpha.size() == variables);
267 
268  std::size_t v, w, i, e;
269  unsigned int p;
270 
271  double start_time = Timer::now();
272 
273  this->C = C;
274  alpha = solutionAlpha;
275  bias = solutionBias;
276 
277  double dualAccuracy = (bias == NULL) ? stop.minAccuracy : 0.5 * stop.minAccuracy;
278 
279  // prepare lists
280  linear.resize(variables);
281  gradient.resize(variables);
282  variable.resize(variables);
283  storage1.resize(variables);
284  storage2.resize(variables);
285  example.resize(examples);
286 
287  // prepare solver internal variables
288  activeEx = examples;
289  activeVar = variables;
290  for (v=0, i=0; i<examples; i++)
291  {
292  unsigned int y = target.element(i);
293  example[i].index = i;
294  example[i].y = y;
295  example[i].active = cardP;
296  example[i].var = &storage1[cardP * i];
297  example[i].avar = &storage2[cardP * i];
298  example[i].varsum = 0.0;
299  double k = kernelMatrix.entry(i, i);
300  for (p=0; p<cardP; p++, v++)
301  {
302  variable[v].i = i;
303  variable[v].p = p;
304  variable[v].index = p;
305  double Q = M(classes * (y * cardP + p) + y, p) * k;
306  variable[v].diagonal = Q;
307  storage1[v] = v;
308  storage2[v] = v;
309  example[i].varsum += solutionAlpha(v);
310  double lin = gamma(y, p);
311  if (bias != NULL)
312  {
313  typename QpSparseArray<QpFloatType>::Row const& row = nu.row(y * cardP + p);
314  for (e=0; e<row.size; e++) lin -= row.entry[e].value * (*bias)(row.entry[e].index);
315  }
316  linear(v) = gradient(v) = lin;
317  }
318  }
319  SHARK_ASSERT(v == variables);
320 
321  // gradient initialization
322  e = (std::size_t)(-1); // invalid value
323  QpFloatType* q = NULL;
324  for (v=0, i=0; i<examples; i++)
325  {
326  unsigned int y = example[i].y;
327  for (p=0; p<cardP; p++, v++)
328  {
329  double av = alpha(v);
330  if (av != 0.0)
331  {
332  std::size_t iv = variable[v].i;
333  if (iv != e)
334  {
335  q = kernelMatrix.row(iv, 0, activeEx);
336  e = iv;
337  }
338  unsigned int r = y*cardP+p;
339  GRADIENT_UPDATE(r, 0, activeEx, av, q);
340  }
341  }
342  }
343 
344  if (bias != NULL) initializeLP();
345 
346  bUnshrinked = false;
347  std::size_t checkCounter = ITERATIONS_BETWEEN_SHRINKING;
348 
349  // initial shrinking (useful for dummy variables and warm starts)
350  if (useShrinking) shrink(stop.minAccuracy);
351 
352  // decomposition loop
353  unsigned long long iter = 0;
354  while (iter != stop.maxIterations)
355  {
356  // select a working set and check for optimality
357  double acc = selectWorkingSet(v, w);
358  if (acc < dualAccuracy)
359  {
360  // seems to be optimal
361 
362  if (useShrinking)
363  {
364  // do costly unshrinking
365  unshrink(dualAccuracy, true);
366 
367  // check again on the whole problem
368  if (checkKKT() < dualAccuracy)
369  {
370  if (bias != NULL)
371  {
372  solveForBias(dualAccuracy);
373  if (checkKKT() < stop.minAccuracy)
374  {
375  if (prop != NULL) prop->type = QpAccuracyReached;
376  break;
377  }
378  }
379  else
380  {
381  if (prop != NULL) prop->type = QpAccuracyReached;
382  break;
383  }
384  }
385 
386  shrink(stop.minAccuracy);
387  checkCounter = ITERATIONS_BETWEEN_SHRINKING;
388  }
389  else
390  {
391  if (bias != NULL)
392  {
393  solveForBias(dualAccuracy);
394  if (checkKKT() < stop.minAccuracy)
395  {
396  if (prop != NULL) prop->type = QpAccuracyReached;
397  break;
398  }
399  }
400  else
401  {
402  if (prop != NULL) prop->type = QpAccuracyReached;
403  break;
404  }
405  }
406 
407  selectWorkingSet(v, w);
408  }
409 
410  // update
411  if (v == w)
412  {
413  // Limit case of a single variable;
414  // this means that there is only one
415  // non-optimal variable left.
416  std::size_t i = variable[v].i;
417  unsigned int p = variable[v].p;
418  unsigned int y = example[i].y;
419  unsigned int r = cardP * y + p;
420  QpFloatType* q = kernelMatrix.row(i, 0, activeEx);
421  double Qvv = variable[v].diagonal;
422  double mu = gradient(v) / Qvv;
423  if (mu < 0.0)
424  {
425  if (mu <= -alpha(v))
426  {
427  mu = -alpha(v);
428  alpha(v) = 0.0;
429  }
430  else alpha(v) += mu;
431  if (cardR < cardP) example[i].varsum += mu;
432  }
433  else
434  {
435  if (cardR < cardP)
436  {
437  double& varsum = example[i].varsum;
438  double max_mu = C - varsum;
439  double max_alpha = max_mu + alpha(v);
440  if (mu >= max_mu)
441  {
442  mu = max_mu;
443  alpha(v) = max_alpha;
444  varsum = C;
445  }
446  else
447  {
448  alpha(v) += mu;
449  varsum += mu;
450  }
451  }
452  else
453  {
454  if (mu >= C - alpha(v))
455  {
456  mu = C - alpha(v);
457  alpha(v) = C;
458  }
459  else alpha(v) += mu;
460  }
461  }
462  GRADIENT_UPDATE(r, 0, activeEx, mu, q);
463  }
464  else
465  {
466  // S2DO
467  std::size_t iv = variable[v].i;
468  unsigned int pv = variable[v].p;
469  unsigned int yv = example[iv].y;
470 
471  std::size_t iw = variable[w].i;
472  unsigned int pw = variable[w].p;
473  unsigned int yw = example[iw].y;
474 
475  // get the matrix rows corresponding to the working set
476  QpFloatType* qv = kernelMatrix.row(iv, 0, activeEx);
477  QpFloatType* qw = kernelMatrix.row(iw, 0, activeEx);
478  unsigned int rv = cardP*yv+pv;
479  unsigned int rw = cardP*yw+pw;
480 
481  // get the Q-matrix restricted to the working set
482  double Qvv = variable[v].diagonal;
483  double Qww = variable[w].diagonal;
484  double Qvw = M(classes * rv + yw, pw) * qv[iw];
485 
486  // solve the sub-problem
487  double mu_v = 0.0;
488  double mu_w = 0.0;
489 
490  if (cardR < cardP)
491  {
492  if (iv == iw)
493  {
495  example[iv].varsum,
496  gradient(v), gradient(w),
497  Qvv, Qvw, Qww,
498  mu_v, mu_w);
499  }
500  else
501  {
502  double& varsum1 = example[iv].varsum;
503  double& varsum2 = example[iw].varsum;
504  double U1 = C - (varsum1 - alpha(v));
505  double U2 = C - (varsum2 - alpha(w));
506  solve2D_box(alpha(v), alpha(w),
507  gradient(v), gradient(w),
508  Qvv, Qvw, Qww,
509  U1, U2,
510  mu_v, mu_w);
511 
512  // improve numerical stability:
513  if (alpha(v) == U1) varsum1 = C;
514  else varsum1 += mu_v;
515  if (alpha(w) == U2) varsum2 = C;
516  else varsum2 += mu_w;
517  }
518  }
519  else
520  {
521  solve2D_box(alpha(v), alpha(w),
522  gradient(v), gradient(w),
523  Qvv, Qvw, Qww,
524  C, C,
525  mu_v, mu_w);
526  }
527 
528  // update the gradient
529  GRADIENT_UPDATE(rv, 0, activeEx, mu_v, qv);
530  GRADIENT_UPDATE(rw, 0, activeEx, mu_w, qw);
531  }
532 
533  checkCounter--;
534  if (checkCounter == 0)
535  {
536  // shrink the problem
537  if (useShrinking) shrink(stop.minAccuracy);
538 
539  checkCounter = ITERATIONS_BETWEEN_SHRINKING;
540 
541  if (stop.maxSeconds < 1e100)
542  {
543  double current_time = Timer::now();
544  if (current_time - start_time > stop.maxSeconds)
545  {
546  if (prop != NULL) prop->type = QpTimeout;
547  break;
548  }
549  }
550  }
551 
552  iter++;
553  }
554 
555  if (iter == stop.maxIterations)
556  {
557  if (prop != NULL) prop->type = QpMaxIterationsReached;
558  }
559 
560  // fill in the solution and compute the objective value
561  RealVector solutionGradient(variables);
562  double objective = 0.0;
563  for (v=0; v<variables; v++)
564  {
565  std::size_t w = cardP * example[variable[v].i].index + variable[v].p;
566  solutionAlpha(w) = alpha(v);
567  solutionGradient(w) = gradient(v);
568  objective += (gradient(v) + linear(v)) * alpha(v);
569  }
570  objective *= 0.5;
571 
572  double finish_time = Timer::now();
573 
574  if (prop != NULL)
575  {
576  prop->accuracy = checkKKT();
577  prop->value = objective;
578  prop->iterations = iter;
579  prop->seconds = finish_time - start_time;
580  }
581 /*{
582  // dump alphas
583  for (std::size_t i=0, e=0; i<examples; i++)
584  {
585  printf("target(%lu) = %u\n", i, target.element(i));
586  for (std::size_t c=0; c<classes; c++, e++)
587  {
588  printf("(%lu, %lu) alpha: %g \tgradient: %g\n", i, c, solutionAlpha(e), solutionGradient(e));
589  }
590  }
591 }*/
592 
593  }
594 
595  //!
596  //! \brief solve the quadratic program with SMO
597  //!
598  //! \param target class labels in {0, ..., classes-1}
599  //! \param C regularization constant, upper bound on all variables
600  //! \param solutionAlpha input: initial feasible vector \f$ \alpha \f$; output: solution \f$ \alpha^* \f$
601  //! \param stop stopping condition(s)
602  //! \param prop solution properties (may be NULL)
603  //! \param solutionBias input: initial bias parameters \f$ b \f$; output: solution \f$ b^* \f$. If this parameter is NULL, then the corresponding problem without bias is solved.
604  //!
605  void solveSMO(Data<unsigned int> const& target,
606  double C,
607  RealVector& solutionAlpha,
608  QpStoppingCondition& stop,
609  QpSolutionProperties* prop = NULL,
610  RealVector* solutionBias = NULL)
611  {
612  SIZE_CHECK(target.numberOfElements() == examples);
613  SIZE_CHECK(solutionAlpha.size() == variables);
614 
615  std::size_t v, w, i, e;
616  unsigned int p;
617 
618  double start_time = Timer::now();
619 
620  this->C = C;
621  alpha = solutionAlpha;
622  bias = solutionBias;
623 
624  double dualAccuracy = (bias == NULL) ? stop.minAccuracy : 0.5 * stop.minAccuracy;
625 
626  // prepare lists
627  linear.resize(variables);
628  gradient.resize(variables);
629  variable.resize(variables);
630  storage1.resize(variables);
631  storage2.resize(variables);
632  example.resize(examples);
633 
634  // prepare solver internal variables
635  activeEx = examples;
636  activeVar = variables;
637  for (v=0, i=0; i<examples; i++)
638  {
639  unsigned int y = target.element(i);
640  example[i].index = i;
641  example[i].y = y;
642  example[i].active = cardP;
643  example[i].var = &storage1[cardP * i];
644  example[i].avar = &storage2[cardP * i];
645  example[i].varsum = 0.0;
646  for (p=0; p<cardP; p++, v++)
647  {
648  variable[v].i = i;
649  variable[v].p = p;
650  variable[v].index = p;
651  double Q = M(classes * (y * cardP + p) + y, p) * kernelMatrix.entry(i, i);
652  variable[v].diagonal = Q;
653  storage1[v] = v;
654  storage2[v] = v;
655  example[i].varsum += solutionAlpha(v);
656  double lin = gamma(y, p);
657  if (bias != NULL)
658  {
659  typename QpSparseArray<QpFloatType>::Row const& row = nu.row(y * cardP + p);
660  for (e=0; e<row.size; e++) lin -= row.entry[e].value * (*bias)(row.entry[e].index);
661  }
662  linear(v) = gradient(v) = lin;
663  }
664  }
665 
666  // gradient initialization
667  e = (std::size_t)(-1); // invalid value
668  QpFloatType* q = NULL;
669  for (v=0, i=0; i<examples; i++)
670  {
671  unsigned int y = example[i].y;
672  for (p=0; p<cardP; p++, v++)
673  {
674  double av = alpha(v);
675  if (av != 0.0)
676  {
677  std::size_t iv = variable[v].i;
678  if (iv != e)
679  {
680  q = kernelMatrix.row(iv, 0, activeEx);
681  e = iv;
682  }
683  unsigned int r = y*cardP+p;
684  GRADIENT_UPDATE(r, 0, activeEx, av, q);
685  }
686  }
687  }
688 
689  if (bias != NULL) initializeLP();
690 
691  bUnshrinked = false;
692  std::size_t checkCounter = ITERATIONS_BETWEEN_SHRINKING;
693 
694  // initial shrinking (useful for dummy variables and warm starts)
695  if (useShrinking) shrink(stop.minAccuracy);
696 
697  // decomposition loop
698  unsigned long long iter = 0;
699  while (iter != stop.maxIterations)
700  {
701  // select a working set and check for optimality
702  double acc = selectWorkingSetSMO(v, w);
703  if (acc < dualAccuracy)
704  {
705  // seems to be optimal
706 
707  if (useShrinking)
708  {
709  // do costly unshrinking
710  unshrink(dualAccuracy, true);
711 
712  // check again on the whole problem
713  if (checkKKT() < dualAccuracy)
714  {
715  if (bias != NULL)
716  {
717  solveForBias(dualAccuracy);
718  if (checkKKT() < stop.minAccuracy)
719  {
720  if (prop != NULL) prop->type = QpAccuracyReached;
721  break;
722  }
723  }
724  else
725  {
726  if (prop != NULL) prop->type = QpAccuracyReached;
727  break;
728  }
729  }
730 
731  shrink(stop.minAccuracy);
732  checkCounter = ITERATIONS_BETWEEN_SHRINKING;
733  }
734  else
735  {
736  if (bias != NULL)
737  {
738  solveForBias(dualAccuracy);
739  if (checkKKT() < stop.minAccuracy)
740  {
741  if (prop != NULL) prop->type = QpAccuracyReached;
742  break;
743  }
744  }
745  else
746  {
747  if (prop != NULL) prop->type = QpAccuracyReached;
748  break;
749  }
750  }
751 
752  selectWorkingSetSMO(v, w);
753  }
754 
755  // update
756  {
757  std::size_t iv = variable[v].i;
758  unsigned int pv = variable[v].p;
759  unsigned int yv = example[iv].y;
760 
761 // std::size_t iw = variable[w].i;
762 // unsigned int pw = variable[w].p;
763 // unsigned int yw = example[iw].y;
764 
765  // get the matrix rows corresponding to the working set
766  QpFloatType* qv = kernelMatrix.row(iv, 0, activeEx);
767 // QpFloatType* qw = kernelMatrix.row(iw, 0, activeEx);
768  unsigned int rv = cardP*yv+pv;
769 // unsigned int rw = cardP*yw+pw;
770 
771  // get the Q-matrix restricted to the working set
772  double Qvv = variable[v].diagonal;
773 // double Qww = variable[w].diagonal;
774 // double Qvw = M(classes * rv + yw, pw) * qv[iw];
775 
776  // solve the sub-problem
777  double mu_v = 0.0;
778 // double mu_w = 0.0;
779 
780  if (cardR < cardP)
781  {
782  throw SHARKEXCEPTION("[QpMcDecomp::solveSMO] SMO is implemented only for box constraints");
783  }
784  else
785  {
786  if (v != w) throw SHARKEXCEPTION("[QpMcDecomp::solveSMO] internal error");
787  double gv = gradient(v);
788  if (Qvv == 0.0)
789  {
790  if (gv > 0.0)
791  {
792  mu_v = C - alpha(v);
793  alpha(v) = C;
794  }
795  else
796  {
797  mu_v = -alpha(v);
798  alpha(v) = 0.0;
799  }
800  }
801  else
802  {
803  mu_v = gv / Qvv;
804  double a = alpha(v) + mu_v;
805  if (a <= 0.0)
806  {
807  mu_v = -alpha(v);
808  alpha(v) = 0.0;
809  }
810  else if (a >= C)
811  {
812  mu_v = C - alpha(v);
813  alpha(v) = C;
814  }
815  else
816  {
817  alpha(v) = a;
818  }
819  }
820  }
821 
822  // update the gradient
823  GRADIENT_UPDATE(rv, 0, activeEx, mu_v, qv);
824 // GRADIENT_UPDATE(rw, 0, activeEx, mu_w, qw);
825  }
826 
827  checkCounter--;
828  if (checkCounter == 0)
829  {
830  // shrink the problem
831  if (useShrinking) shrink(stop.minAccuracy);
832 
833  checkCounter = ITERATIONS_BETWEEN_SHRINKING;
834 
835  if (stop.maxSeconds < 1e100)
836  {
837  double current_time = Timer::now();
838  if (current_time - start_time > stop.maxSeconds)
839  {
840  if (prop != NULL) prop->type = QpTimeout;
841  break;
842  }
843  }
844  }
845 
846  iter++;
847  }
848 
849  if (iter == stop.maxIterations)
850  {
851  if (prop != NULL) prop->type = QpMaxIterationsReached;
852  }
853 
854  // fill in the solution and compute the objective value
855  double objective = 0.0;
856  for (v=0; v<variables; v++)
857  {
858  std::size_t w = cardP * example[variable[v].i].index + variable[v].p;
859  solutionAlpha(w) = alpha(v);
860  objective += (gradient(v) + linear(v)) * alpha(v);
861  }
862  objective *= 0.5;
863 
864  double finish_time = Timer::now();
865 
866  if (prop != NULL)
867  {
868  prop->accuracy = checkKKT();
869  prop->value = objective;
870  prop->iterations = iter;
871  prop->seconds = finish_time - start_time;
872  }
873  }
874 
875  //! enable/disable shrinking
876  void setShrinking(bool shrinking = true)
877  { useShrinking = shrinking; }
878 
879 protected:
880  /// Initialize the linear program member LP
881  /// for solving the problem with bias parameters.
882  void initializeLP()
883  {
884 /*
885  std::size_t rows = variables + 1;
886  std::size_t cols = examples*cardR + classes;
887  std::size_t v, xi, i, r, p, b, c, y;
888 
889  // prepare the linear program description
890  lp.setMinimize();
891  lp.addRows(rows);
892  lp.addColumns(cols);
893 
894  // slack variables and box and margin constraints
895  std::vector<unsigned int> row(1); // row indices
896  std::vector<unsigned int> col(1); // column indices
897  std::vector<double> value(1); // coefficients
898  unsigned int PperR = cardP / cardR;
899  for (v=0, xi=0, i=0; i<examples; i++)
900  {
901  y = example[i].y;
902  for (r=0; r<cardR; r++)
903  {
904  lp.setObjectiveCoefficient(1+xi+r, 1.0);
905  lp.setColumnLowerBounded(1+xi+r, 0.0);
906  }
907  for (p=0; p<cardP; p++)
908  {
909  r = p / PperR;
910  lp.setRowLowerBounded(1+v+p, gradient(v+p));
911 
912  // \xi_{i,p}
913  row.push_back(1+v+p);
914  col.push_back(1+xi+r);
915  value.push_back(1.0);
916  typename QpSparseArray<QpFloatType>::Row const& nu_row = nu.row(y * cardP + p);
917  for (b=0; b<nu_row.size; b++)
918  {
919  // \nu_{c,p,y_i} \Delta b_c
920  row.push_back(1+v+p);
921  col.push_back(1+examples*cardR + nu_row.entry[b].index);
922  value.push_back(nu_row.entry[b].value);
923  }
924  }
925  v += cardP;
926  xi += cardR;
927  }
928 
929  // bias parameters and equality constraint
930  if (sumToZero) lp.setRowFixed(1+v, 0.0); // sum-to-zero constraint
931  for (c=0; c<classes; c++)
932  {
933  lp.setColumnFree(1+xi+c);
934  lp.setObjectiveCoefficient(1+xi+c, 0.0);
935 
936  if (sumToZero)
937  {
938  // b_c
939  row.push_back(1+v);
940  col.push_back(1+xi+c);
941  value.push_back(1.0);
942  }
943  }
944 
945  // set the matrix connecting variables and constraints
946  lp.setConstraintMatrix(row, col, value);
947 */
948  }
949 
950  //! Solve the primal problem with fixed weight vectors
951  //! for the bias variables (and the slack variables,
952  //! but these are ignored).
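 //! \par
 //! The minimization is carried out by a sign-based (Rprop-style)
 //! iteration: each bias component moves against the sign of its
 //! (projected) gradient, \f$ b_c \leftarrow b_c - \operatorname{sign}(g_c)\, \eta_c \f$,
 //! where the individual step size \f$ \eta_c \f$ grows by a factor of 1.2
 //! while the gradient sign is stable and shrinks by 0.5 when it flips.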
953  void solveForBias(double epsilon)
954  {
955  SHARK_CHECK(bias != NULL, "[QpMcDecomp::solveForBias] internal error");
956 
957  std::size_t i, v, b;
958  unsigned int p, c;
959  RealVector stepsize(classes,epsilon);
960  RealVector prev(classes,0.0);
961  RealVector step(classes);
962 
963  // Rprop loop
964  while (true)
965  {
966  // compute the primal gradient w.r.t. bias
967  RealVector grad(classes,0.0);
968  if (cardR < cardP)
969  {
970  // simplex case
971  for (i=0; i<examples; i++)
972  {
973  tExample& ex = example[i];
974  unsigned int largest_p = cardP;
975  double largest_value = 0.0;
976  for (p=0; p<cardP; p++)
977  {
978  std::size_t v = ex.var[p];
979  SHARK_ASSERT(v < activeVar);
980  double g = gradient(v);
981  if (g > largest_value)
982  {
983  largest_value = g;
984  largest_p = p;
985  }
986  }
987  if (largest_p < cardP)
988  {
989  typename QpSparseArray<QpFloatType>::Row const& row = nu.row(ex.y * cardP + largest_p);
990  for (b=0; b<row.size; b++) grad(row.entry[b].index) -= row.entry[b].value;
991  }
992  }
993  }
994  else
995  {
996  // box case
997  for (v=0; v<variables; v++)
998  {
999  double g = gradient(v);
1000  if (g > 0.0)
1001  {
1002  tVariable& var = variable[v];
1003  tExample& ex = example[var.i];
1004  typename QpSparseArray<QpFloatType>::Row const& row = nu.row(ex.y * cardP + var.p);
1005  for (b=0; b<row.size; b++) grad(row.entry[b].index) -= row.entry[b].value;
1006  }
1007  }
1008  }
1009 
1010  if (sumToZero)
1011  {
1012  // project the gradient
1013  grad -= sum(grad) / classes;
1014  }
1015 
1016  // Rprop
1017  for (c=0; c<classes; c++)
1018  {
1019  double g = grad(c);
1020  if (g > 0.0) step(c) = -stepsize(c);
1021  else if (g < 0.0) step(c) = stepsize(c);
1022 
1023  double gg = prev(c) * grad(c);
1024  if (gg > 0.0) stepsize(c) *= 1.2;
1025  else stepsize(c) *= 0.5;
1026  }
1027  prev = grad;
1028 
1029  if (sumToZero)
1030  {
1031  // project the step
1032  step -= sum(step) / classes;
1033  }
1034 
1035  // update the solution and the dual gradient
1036  (*bias) += step;
1037  for (v=0; v<variables; v++)
1038  {
1039  tVariable& var = variable[v];
1040  tExample& ex = example[var.i];
1041 
1042  // delta = \sum_m \nu_{m,p,y_i} \Delta b(m)
1043  typename QpSparseArray<QpFloatType>::Row const& row = nu.row(ex.y * cardP + var.p);
1044  double delta = 0.0;
1045  for (b=0; b<row.size; b++)
1046  {
1047  delta += row.entry[b].value * step(row.entry[b].index);
1048  }
1049  gradient(v) -= delta;
1050  linear(v) -= delta;
1051  }
1052 
1053  // stopping criterion
1054  double maxstep = 0.0;
1055  for (c=0; c<classes; c++) if (stepsize(c) > maxstep) maxstep = stepsize(c);
1056  if (maxstep < 0.01 * epsilon) break;
1057  }
1058 
1059 /*
1060  std::size_t v, xi, i, b;
1061 // unsigned int c, p, r, PperR = cardP / cardR;
1062 
1063  // modify lower bounds
1064  for (v=0, i=0; i<examples; i++)
1065  {
1066  tExample& ex = example[i];
1067  for (p=0; p<cardP; p++, v++)
1068  {
1069  lp.setRowLowerBounded(1+v, gradient(ex.var[p]));
1070  }
1071  }
1072 
1073  // define initial vertex
1074  if (cardR == cardP)
1075  {
1076  // box case
1077  for (v=0, xi=0, i=0; i<examples; i++)
1078  {
1079  for (p=0; p<cardP; p++, v++)
1080  {
1081  if (alpha(v) == C)
1082  {
1083  lp.setRowStatus(1+v, false);
1084  lp.setColumnStatus(1+v, true);
1085  }
1086  else
1087  {
1088  lp.setRowStatus(1+v, true);
1089  lp.setColumnStatus(1+v, false);
1090  }
1091  }
1092  }
1093 
1094  if (sumToZero) lp.setRowStatus(1+variables, true);
1095  for (c=0; c<classes; c++) lp.setColumnStatus(1+examples*cardR+c, false);
1096  }
1097  else
1098  {
1099  // simplex case
1100 // throw SHARKEXCEPTION("[solveForBias] simplex case not implemented yet");
1101  // TODO!!!
1102 
1103 // if (sumToZero) lp.setRowStatus(1+variables, true);
1104 // for (c=0; c<classes; c++) lp.setColumnStatus(1+examples*cardR+c, false);
1105  }
1106 
1107  // solve the LP
1108  lp.solve();
1109 
1110  // read out the solution
1111  RealVector diff(classes);
1112  for (c=0; c<classes; c++) diff(c) = lp.solution(1+examples*cardR+c);
1113 
1114  // update the solution and the dual gradient
1115  (*bias) += diff;
1116  for (v=0; v<variables; v++)
1117  {
1118  tVariable& var = variable[v];
1119  tExample& ex = example[var.i];
1120 
1121  // delta = \sum_m \nu_{m,p,y_i} \Delta b(m)
1122  typename QpSparseArray<QpFloatType>::Row const& row = nu.row(ex.y * cardP + var.p);
1123  double delta = 0.0;
1124  for (b=0; b<row.size; b++)
1125  {
1126  delta += row.entry[b].value * diff(row.entry[b].index);
1127  }
1128  gradient(v) -= delta;
1129  linear(v) -= delta;
1130  }
1131 printf("[solveForBias] diff=(");
1132 for (c=0; c<classes; c++) printf(" %g", diff(c));
1133 printf(")\n");
1134 printf("[solveForBias] b=(");
1135 for (c=0; c<classes; c++) printf(" %g", (*bias)(c));
1136 printf(")\n");
1137 */
1138  }
1139 
1140  //! Exact solver for the one-dimensional sub-problem<br>
1141  //! maximize \f$ g \mu - Q/2 \mu^2 \f$<br>
1142  //! such that \f$ 0 \leq \alpha \leq U \f$<br>
1143  //! The method returns the optimal alpha as well as
1144  //! the step mu leading to the update
1145  //! \f$ \alpha \leftarrow \alpha + \mu \f$.
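 //! \par
 //! Setting the derivative \f$ g - Q \mu \f$ of the objective to zero
 //! gives the unconstrained optimum \f$ \mu^* = g / Q \f$; the
 //! implementation clips \f$ \alpha + \mu^* \f$ back to \f$ [0, U] \f$,
 //! and moves to the bound indicated by the sign of g whenever
 //! \f$ g / Q \f$ is not a normal floating point number.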
1146  void solveEdge(double& alpha, double g, double Q, double U, double& mu)
1147  {
1148  mu = g / Q;
1149  if (! boost::math::isnormal(mu))
1150  {
1151  if (g > 0.0)
1152  {
1153  mu = U - alpha;
1154  alpha = U;
1155  }
1156  else
1157  {
1158  mu = -alpha;
1159  alpha = 0.0;
1160  }
1161  }
1162  else
1163  {
1164  double a = alpha + mu;
1165  if (a <= 0.0)
1166  {
1167  mu = -alpha;
1168  alpha = 0.0;
1169  }
1170  else if (a >= U)
1171  {
1172  mu = U - alpha;
1173  alpha = U;
1174  }
1175  else
1176  {
1177  alpha = a;
1178  }
1179  }
1180  }
1181 
1182  //! Exact solver for the S2DO problem for the case
1183  //! that the sub-problem has box-constraints. The
1184  //! method updates alpha and in addition returns
1185  //! the step mu.
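 //! \par
 //! The free step solves the 2x2 Newton system
 //! \f$ \begin{pmatrix} Q_{ii} & Q_{ij} \\ Q_{ij} & Q_{jj} \end{pmatrix}
 //! \begin{pmatrix} \mu_i \\ \mu_j \end{pmatrix}
 //! = \begin{pmatrix} g_i \\ g_j \end{pmatrix} \f$
 //! by Cramer's rule. If the resulting point leaves the box
 //! \f$ [0, U_i] \times [0, U_j] \f$, all four edges of the box are
 //! solved with solveEdge and the candidate with the largest gain wins.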
1186  void solve2D_box(double& alphai, double& alphaj,
1187  double gi, double gj,
1188  double Qii, double Qij, double Qjj,
1189  double Ui, double Uj,
1190  double& mui, double& muj)
1191  {
1192  // try the free solution first
1193  double detQ = Qii * Qjj - Qij * Qij;
1194  mui = (Qjj * gi - Qij * gj) / detQ;
1195  muj = (Qii * gj - Qij * gi) / detQ;
1196  double opti = alphai + mui;
1197  double optj = alphaj + muj;
1198  if (boost::math::isnormal(opti) && boost::math::isnormal(optj) && opti > 0.0 && optj > 0.0 && opti < Ui && optj < Uj)
1199  {
1200  alphai = opti;
1201  alphaj = optj;
1202  return;
1203  }
1204 
1205  // otherwise process all edges
1206  struct tEdgeSolution
1207  {
1208  double alphai;
1209  double alphaj;
1210  double mui;
1211  double muj;
1212  };
1213  tEdgeSolution solution[4];
1214  tEdgeSolution* sol = solution;
1215  tEdgeSolution* best = solution;
1216  double gain, bestgain = 0.0;
1217  // edge \alpha_1 = 0
1218  {
1219  sol->alphai = 0.0;
1220  sol->alphaj = alphaj;
1221  sol->mui = -alphai;
1222  solveEdge(sol->alphaj, gj + Qij * alphai, Qjj, Uj, sol->muj);
1223  gain = sol->mui * (gi - 0.5 * (Qii*sol->mui + Qij*sol->muj))
1224  + sol->muj * (gj - 0.5 * (Qij*sol->mui + Qjj*sol->muj));
1225  if (gain > bestgain) { bestgain = gain; best = sol; }
1226  sol++;
1227  }
1228  // edge \alpha_2 = 0
1229  {
1230  sol->alphai = alphai;
1231  sol->alphaj = 0.0;
1232  sol->muj = -alphaj;
1233  solveEdge(sol->alphai, gi + Qij * alphaj, Qii, Ui, sol->mui);
1234  gain = sol->mui * (gi - 0.5 * (Qii*sol->mui + Qij*sol->muj))
1235  + sol->muj * (gj - 0.5 * (Qij*sol->mui + Qjj*sol->muj));
1236  if (gain > bestgain) { bestgain = gain; best = sol; }
1237  sol++;
1238  }
1239  // edge \alpha_1 = U_1
1240  {
1241  sol->alphai = Ui;
1242  sol->alphaj = alphaj;
1243  sol->mui = Ui - alphai;
1244  solveEdge(sol->alphaj, gj - Qij * sol->mui, Qjj, Uj, sol->muj);
1245  gain = sol->mui * (gi - 0.5 * (Qii*sol->mui + Qij*sol->muj))
1246  + sol->muj * (gj - 0.5 * (Qij*sol->mui + Qjj*sol->muj));
1247  if (gain > bestgain) { bestgain = gain; best = sol; }
1248  sol++;
1249  }
1250  // edge \alpha_2 = U_2
1251  {
1252  sol->alphai = alphai;
1253  sol->alphaj = Uj;
1254  sol->muj = Uj - alphaj;
1255  solveEdge(sol->alphai, gi - Qij * sol->muj, Qii, Ui, sol->mui);
1256  gain = sol->mui * (gi - 0.5 * (Qii*sol->mui + Qij*sol->muj))
1257  + sol->muj * (gj - 0.5 * (Qij*sol->mui + Qjj*sol->muj));
1258  if (gain > bestgain) { bestgain = gain; best = sol; }
1259  sol++;
1260  }
1261  alphai = best->alphai;
1262  alphaj = best->alphaj;
1263  mui = best->mui;
1264  muj = best->muj;
1265  }
1266 
1267  //! Exact solver for the S2DO problem for the case
1268  //! that the sub-problem has simplex-constraints.
1269  //! The method updates alpha and in addition returns
1270  //! the step mu.
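 //! \par
 //! Here the feasible set is the triangle with
 //! \f$ \alpha_i + \mu_i \geq 0 \f$, \f$ \alpha_j + \mu_j \geq 0 \f$ and
 //! \f$ (\alpha_i + \mu_i) + (\alpha_j + \mu_j) \leq U \f$, where
 //! \f$ U = (C - \textrm{alphasum}) + \alpha_i + \alpha_j \f$ is the slack
 //! left by the other variables of the example. The free Newton step is
 //! tried first; otherwise the three edges of the triangle are processed.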
1271  void solve2D_triangle(double& alphai, double& alphaj,
1272  double& alphasum,
1273  double gi, double gj,
1274  double Qii, double Qij, double Qjj,
1275  double& mui, double& muj)
1276  {
1277  // try the free solution first
1278  double V = C - alphasum;
1279  double U = V + alphai + alphaj;
1280  double detQ = Qii * Qjj - Qij * Qij;
1281  mui = (Qjj * gi - Qij * gj) / detQ;
1282  muj = (Qii * gj - Qij * gi) / detQ;
1283  double opti = alphai + mui;
1284  double optj = alphaj + muj;
1285  if (boost::math::isnormal(opti) && boost::math::isnormal(optj) && opti > 0.0 && optj > 0.0 && opti + optj < U)
1286  {
1287  alphai = opti;
1288  alphaj = optj;
1289  alphasum += mui + muj;
1290  if (alphasum > C) alphasum = C; // for numerical stability
1291  return;
1292  }
1293 
1294  // otherwise process all edges
1295  struct tEdgeSolution
1296  {
1297  double alphai;
1298  double alphaj;
1299  double alphasum;
1300  double mui;
1301  double muj;
1302  };
1303  tEdgeSolution solution[3];
1304  tEdgeSolution* sol = solution;
1305  tEdgeSolution* best = NULL;
1306  double gain, bestgain = -1e100;
1307  // edge \alpha_1 = 0
1308  {
1309  sol->alphai = 0.0;
1310  sol->alphaj = alphaj;
1311  sol->mui = -alphai;
1312  solveEdge(sol->alphaj, gj + Qij * alphai, Qjj, V + alphaj, sol->muj);
1313  sol->alphasum = alphasum + sol->mui + sol->muj;
1314  gain = sol->mui * (gi - 0.5 * (Qii*sol->mui + Qij*sol->muj))
1315  + sol->muj * (gj - 0.5 * (Qij*sol->mui + Qjj*sol->muj));
1316  if (gain > bestgain) { bestgain = gain; best = sol; }
1317  sol++;
1318  }
1319  // edge \alpha_2 = 0
1320  {
1321  sol->alphai = alphai;
1322  sol->alphaj = 0.0;
1323  sol->muj = -alphaj;
1324  solveEdge(sol->alphai, gi + Qij * alphaj, Qii, V + alphai, sol->mui);
1325  sol->alphasum = alphasum + sol->mui + sol->muj;
1326  gain = sol->mui * (gi - 0.5 * (Qii*sol->mui + Qij*sol->muj))
1327  + sol->muj * (gj - 0.5 * (Qij*sol->mui + Qjj*sol->muj));
1328  if (gain > bestgain) { bestgain = gain; best = sol; }
1329  sol++;
1330  }
1331  // edge \alpha_1 + \alpha_2 = U
1332  {
1333  double a = 0.0, mu = 0.0;
1334  double ggi = gi - (U - alphai) * Qii + alphaj * Qij;
1335  double ggj = gj - (U - alphai) * Qij + alphaj * Qjj;
1336  solveEdge(a, ggj - ggi, Qii + Qjj - 2.0 * Qij, U, mu);
1337  sol->alphai = U - a;
1338  sol->alphaj = a;
1339  sol->mui = U - a - alphai;
1340  sol->muj = a - alphaj;
1341  sol->alphasum = C;
1342  gain = sol->mui * (gi - 0.5 * (Qii*sol->mui + Qij*sol->muj))
1343  + sol->muj * (gj - 0.5 * (Qij*sol->mui + Qjj*sol->muj));
1344  if (gain > bestgain) { bestgain = gain; best = sol; }
1345  sol++;
1346  }
1347 
1348  alphai = best->alphai;
1349  alphaj = best->alphaj;
1350  alphasum = best->alphasum;
1351  mui = best->mui;
1352  muj = best->muj;
1353 
1354  // improve numerical stability:
1355  if (alphai + alphaj < 1e-12 * C) alphai = alphaj = alphasum = 0.0;
1356  if (alphasum > (1.0 - 1e-12) * C)
1357  {
1358  alphasum = C;
1359  if (alphai > (1.0 - 1e-12) * C) { alphai = C; alphaj = 0.0; }
1360  else if (alphaj > (1.0 - 1e-12) * C) { alphai = 0.0; alphaj = C; }
1361  }
1362  }
1363 /*
1364  //! return the largest KKT violation
1365  double kktViolationActive(unsigned int example)
1366  {
1367  }
1368 
1369  //! return the largest KKT violation
1370  double kktViolationAll(unsigned int example)
1371  {
1372  }
1373 */
1374  //! return the largest KKT violation
1375  double checkKKT()
1376  {
1377  if (cardR == cardP)
1378  {
1379  double ret = 0.0;
1380  std::size_t v;
1381  for (v=0; v<activeVar; v++)
1382  {
1383  double a = alpha(v);
1384  double g = gradient(v);
1385  if (a < C)
1386  {
1387  if (g > ret) ret = g;
1388  }
1389  if (a > 0.0)
1390  {
1391  if (-g > ret) ret = -g;
1392  }
1393  }
1394  return ret;
1395  }
1396  else
1397  {
1398  double ret = 0.0;
1399  std::size_t i, p, pc, v;
1400  for (i=0; i<activeEx; i++)
1401  {
1402  tExample& ex = example[i];
1403  pc = ex.active;
1404  bool cangrow = (ex.varsum < C);
1405  double up = -1e100;
1406  double down = 1e100;
1407  for (p=0; p<pc; p++)
1408  {
1409  v = ex.avar[p];
1410  SHARK_ASSERT(v < activeVar);
1411  double a = alpha(v);
1412  double g = gradient(v);
1413  if (cangrow)
1414  {
1415  if (g > up) up = g;
1416  }
1417  if (a > 0.0)
1418  {
1419  if (g < down) down = g;
1420  }
1421  }
1422  if (up - down > ret) ret = up - down;
1423  if (up > ret) ret = up;
1424  if (-down > ret) ret = -down;
1425  }
1426  return ret;
1427  }
1428  }
1429 
1430  //!
1431  //! \brief select the working set
1432  //!
1433  //! Select one or two variables for the sub-problem
1434  //! and return the maximal KKT violation. The method
1435  //! MAY select the same index for i and j. In that
1436  //! case the working set consists of a single variable.
1437  //! The working set may be invalid if the method reports
1438  //! a KKT violation of zero, indicating optimality.
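 //! \par
 //! The second order criterion used below rates a candidate pair (i, j)
 //! by the functional gain of the corresponding Newton step,
 //! \f$ \frac{g_i^2 Q_{jj} - 2 g_i g_j Q_{ij} + g_j^2 Q_{ii}}{Q_{ii} Q_{jj} - Q_{ij}^2} \f$,
 //! which is what the GAIN_SELECTION_BOX and GAIN_SELECTION_TRIANGLE
 //! macros compute (with a fallback for nearly singular sub-matrices).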
1439  double selectWorkingSet(std::size_t& i, std::size_t& j)
1440  {
1441  if (cardR < cardP)
1442  {
1443  // simplex case
1444  double ret = 0.0;
1445 
1446  // first order selection
1447  std::size_t e;
1448  bool two = false;
1449  for (e=0; e<activeEx; e++)
1450  {
1451  tExample& ex = example[e];
1452  unsigned int b, bc = ex.active;
1453  if (ex.varsum == C)
1454  {
1455  unsigned int b2, a2;
1456  for (b=0; b<bc; b++)
1457  {
1458  std::size_t a = ex.avar[b];
1459  SHARK_ASSERT(a < activeVar);
1460  double aa = alpha(a);
1461  double mga = -gradient(a);
1462  if (aa > 0.0)
1463  {
1464  if (mga > ret)
1465  {
1466  ret = mga;
1467  i = a;
1468  two = false;
1469  }
1470  for (b2=0; b2<bc; b2++)
1471  {
1472  a2 = ex.avar[b2];
1473  SHARK_ASSERT(a2 < activeVar);
1474  double g2 = gradient(a2) + mga;
1475  if (g2 > ret)
1476  {
1477  ret = g2;
1478  i = a;
1479  j = a2;
1480  two = true;
1481  }
1482  }
1483  }
1484  }
1485  }
1486  else
1487  {
1488  double up = -1e100;
1489  double down = 1e100;
1490  std::size_t i_up = activeVar, i_down = activeVar;
1491  for (b=0; b<bc; b++)
1492  {
1493  std::size_t v = ex.avar[b];
1494  SHARK_ASSERT(v < activeVar);
1495  double a = alpha(v);
1496  double g = gradient(v);
1497  if (g > up) { i_up = v; up = g; }
1498  if (a > 0.0 && g < down) { i_down = v; down = g; }
1499  }
1500  if (up - down > ret) { two = true; ret = up - down; i = i_up; j = i_down; }
1501  if (up > ret) { two = false; ret = up; i = i_up; }
1502  if (-down > ret) { two = false; ret = -down; i = i_down; }
1503 /*
1504 // old version (wrong, not checking combined working set for KKT)
1505  for (b=0; b<bc; b++)
1506  {
1507  std::size_t a = ex.avar[b];
1508  SHARK_ASSERT(a < activeVar);
1509  double aa = alpha(a);
1510  double ga = gradient(a);
1511  if (ga > ret)
1512  {
1513  ret = ga;
1514  i = a;
1515  two = false;
1516  }
1517  else if (-ga > ret && aa > 0.0)
1518  {
1519  ret = -ga;
1520  i = a;
1521  two = false;
1522  }
1523  }
1524 */
1525  }
1526  }
1527  if (two || ret == 0.0) return ret;
1528 
1529  // second order selection
1530  std::size_t b, f, pf;
1531  tVariable& vari = variable[i];
1532  std::size_t ii = vari.i;
1533  unsigned int pi = vari.p;
1534  unsigned int yi = example[ii].y;
1535  double di = vari.diagonal;
1536  double gi = gradient(i);
1537  QpFloatType* k = kernelMatrix.row(ii, 0, activeEx);
1538  j = i;
1539  double bestgain = 0.0;
1540  double gain_i = gi * gi / di;
1541  std::size_t a;
1542  for (a=0; a<activeEx; a++)
1543  {
1544  tExample& exa = example[a];
1545  double varsum = exa.varsum;
1546  unsigned int ya = exa.y;
1547  QpFloatType kiia = k[a];
1548  typename QpSparseArray<QpFloatType>::Row const& row = M.row(classes * (yi * cardP + pi) + ya);
1549  QpFloatType def = row.defaultvalue;
1550  if (def == 0.0)
1551  {
1552  for (pf=0, b=0; b<row.size; b++)
1553  {
1554  for (; pf<row.entry[b].index; pf++)
1555  {
1556  f = exa.var[pf];
1557  if (f >= activeVar) continue;
1558  double af = alpha(f);
1559  double gf = gradient(f);
1560  if ((af > 0.0 && gf < 0.0) || (varsum < C && gf > 0.0))
1561  {
1562  double df = variable[f].diagonal;
1563  double gain = gain_i + gf * gf / df;
1564  if (gain > bestgain && f != i) { bestgain = gain; j = f; }
1565  }
1566  }
1567  {
1568  GAIN_SELECTION_TRIANGLE(row.entry[b].value * kiia);
1569  pf++;
1570  }
1571  }
1572  for (; pf<cardP; pf++)
1573  {
1574  f = exa.var[pf];
1575  if (f >= activeVar) continue;
1576  double af = alpha(f);
1577  double gf = gradient(f);
1578  if ((af > 0.0 && gf < 0.0) || (varsum < C && gf > 0.0))
1579  {
1580  double df = variable[f].diagonal;
1581  double gain = gain_i + gf * gf / df;
1582  if (gain > bestgain && f != i) { bestgain = gain; j = f; }
1583  }
1584  }
1585  }
1586  else
1587  {
1588  for (pf=0, b=0; b<row.size; b++)
1589  {
1590  for (; pf<row.entry[b].index; pf++)
1591  {
1592  GAIN_SELECTION_TRIANGLE(def * kiia);
1593  }
1594  {
1595  GAIN_SELECTION_TRIANGLE(row.entry[b].value * kiia);
1596  pf++;
1597  }
1598  }
1599  for (; pf<cardP; pf++)
1600  {
1601  GAIN_SELECTION_TRIANGLE(def * kiia);
1602  }
1603  }
1604  }
1605 
1606  return ret;
1607  }
1608  else
1609  {
1610  // box case
1611  double ret = 0.0;
1612 
1613  // first order selection
1614  std::size_t a;
1615  for (a=0; a<activeVar; a++)
1616  {
1617  double aa = alpha(a);
1618  double ga = gradient(a);
1619  if (ga > ret && aa < C)
1620  {
1621  ret = ga;
1622  i = a;
1623  }
1624  else if (-ga > ret && aa > 0.0)
1625  {
1626  ret = -ga;
1627  i = a;
1628  }
1629  }
1630  if (ret == 0.0) return ret;
1631 
1632  // second order selection
1633  std::size_t b, f, pf;
1634  tVariable& vari = variable[i];
1635  std::size_t ii = vari.i;
1636  unsigned int pi = vari.p;
1637  unsigned int yi = example[ii].y;
1638  double di = vari.diagonal;
1639  double gi = gradient(i);
1640  QpFloatType* k = kernelMatrix.row(ii, 0, activeEx);
1641  j = i;
1642  double bestgain = 0.0;
1643  double gain_i = gi * gi / di;
1644  for (a=0; a<activeEx; a++)
1645  {
1646  tExample& exa = example[a];
1647  unsigned int ya = exa.y;
1648  QpFloatType kiia = k[a];
1649  typename QpSparseArray<QpFloatType>::Row const& row = M.row(classes * (yi * cardP + pi) + ya);
1650  QpFloatType def = row.defaultvalue;
1651  if (def == 0.0)
1652  {
1653  for (pf=0, b=0; b<row.size; b++)
1654  {
1655  for (; pf<row.entry[b].index; pf++)
1656  {
1657  f = exa.var[pf];
1658  if (f >= activeVar) continue;
1659  double af = alpha(f);
1660  double gf = gradient(f);
1661  if ((af > 0.0 && gf < 0.0) || (af < C && gf > 0.0))
1662  {
1663  double df = variable[f].diagonal;
1664  double gain = gain_i + gf * gf / df;
1665  if (gain > bestgain && f != i) { bestgain = gain; j = f; }
1666  }
1667  }
1668  {
1669  GAIN_SELECTION_BOX(row.entry[b].value * kiia);
1670  pf++;
1671  }
1672  }
1673  for (; pf<cardP; pf++)
1674  {
1675  f = exa.var[pf];
1676  if (f >= activeVar) continue;
1677  double af = alpha(f);
1678  double gf = gradient(f);
1679  if ((af > 0.0 && gf < 0.0) || (af < C && gf > 0.0))
1680  {
1681  double df = variable[f].diagonal;
1682  double gain = gain_i + gf * gf / df;
1683  if (gain > bestgain && f != i) { bestgain = gain; j = f; }
1684  }
1685  }
1686  }
1687  else
1688  {
1689  for (pf=0, b=0; b<row.size; b++)
1690  {
1691  for (; pf<row.entry[b].index; pf++)
1692  {
1693  GAIN_SELECTION_BOX(def * kiia);
1694  }
1695  {
1696  GAIN_SELECTION_BOX(row.entry[b].value * kiia);
1697  pf++;
1698  }
1699  }
1700  for (; pf<cardP; pf++)
1701  {
1702  GAIN_SELECTION_BOX(def * kiia);
1703  }
1704  }
1705  }
1706 
1707  return ret;
1708  }
1709  }
1710 
1711  //!
1712  //! \brief select the working set for SMO
1713  //!
1714  //! Select one or two variables for the sub-problem
1715  //! and return the maximal KKT violation. The method
1716  //! MAY select the same index for i and j. In that
1717  //! case the working set consists of a single variable.
1718  //! The working set may be invalid if the method reports
1719  //! a KKT violation of zero, indicating optimality.
1720  double selectWorkingSetSMO(std::size_t& i, std::size_t& j)
1721  {
1722  if (cardR < cardP)
1723  {
1724  // simplex case
1725  throw SHARKEXCEPTION("[QpMcDecomp::selectWorkingSetSMO] SMO is implemented only for box constraints");
1726  }
1727  else
1728  {
1729  // box case
1730  double ret = 0.0;
1731 
1732  // second order selection
1733  std::size_t a;
1734  double bestgain = 0.0;
1735  for (a=0; a<activeVar; a++)
1736  {
1737  double aa = alpha(a);
1738  double ga = gradient(a);
1739  if (ga > 0.0 && aa < C)
1740  {
1741  double gain = ga * ga / variable[a].diagonal;
1742  if (gain > bestgain)
1743  {
1744  i = a;
1745  bestgain = gain;
1746  }
1747  if (ga > ret) ret = ga;
1748  }
1749  else if (ga < 0.0 && aa > 0.0)
1750  {
1751  double gain = ga * ga / variable[a].diagonal;
1752  if (gain > bestgain)
1753  {
1754  i = a;
1755  bestgain = gain;
1756  }
1757  if (-ga > ret) ret = -ga;
1758  }
1759  }
1760  j = i;
1761  return ret;
1762  }
1763  }
1764 
1765  //! Shrink the problem
1766  void shrink(double epsilon)
1767  {
1768  int a;
1769  double v, g;
1770 
1771  if (! bUnshrinked)
1772  {
1773  double largest = 0.0;
1774  for (a = 0; a < (int)activeVar; a++)
1775  {
1776  if (alpha(a) < C)
1777  {
1778  if (gradient(a) > largest) largest = gradient(a);
1779  }
1780  if (alpha(a) > 0.0)
1781  {
1782  if (-gradient(a) > largest) largest = -gradient(a);
1783  }
1784  }
1785  if (largest < 10.0 * epsilon)
1786  {
1787  // unshrink the problem at this accuracy level
1788  unshrink(epsilon, false);
1789  bUnshrinked = true;
1790  return;
1791  }
1792  }
1793 
1794  // shrink variables
1795  bool se = false;
1796  for (a = activeVar - 1; a >= 0; a--)
1797  {
1798  v = alpha(a);
1799  g = gradient(a);
1800 
1801  if ((v == 0.0 && g <= 0.0) || (v == C && g >= 0.0))
1802  {
1803  // In this moment no feasible step including this variable
1804  // can improve the objective. Thus deactivate the variable.
1805  std::size_t e = variable[a].i;
1806  deactivateVariable(a);
1807  if (example[e].active == 0)
1808  {
1809  se = true;
1810  }
1811  }
1812  }
1813 
1814  if (se)
1815  {
1816  // exchange examples such that shrunken examples
1817  // are moved to the ends of the lists
1818  for (a = activeEx - 1; a >= 0; a--)
1819  {
1820  if (example[a].active == 0) deactivateExample(a);
1821  }
1822 
1823  // shrink the corresponding cache entries
1824  //~ for (a = 0; a < (int)activeEx; a++)
1825  //~ {
1826  //~ if (kernelMatrix.getCacheRowSize(a) > activeEx) kernelMatrix.cacheRowResize(a, activeEx);
1827  //~ }
1828  //todo: mt: new shrinking action -> test & verify, remove above 3 lines
1829  //kernelMatrix.setTruncationIndex( activeEx );
1830  kernelMatrix.setMaxCachedIndex(activeEx);
1831  }
1832  }
1833 
1834  //! Activate all variables
1835  void unshrink(double epsilon, bool complete)
1836  {
1837  if (activeVar == variables) return;
1838 
1839  std::size_t v, i;
1840  double mu;
1841 
1842  // compute the inactive gradient components (quadratic time complexity)
1843  RealVectorRange(gradient, Range(activeVar, variables)) = ConstRealVectorRange(linear, Range(activeVar, variables));
1844 // for (v=activeVar; v<variables; v++)
1845 // {
1846 // gradient(v) = linear(v);
1847 // }
1848  for (v=0; v<variables; v++)
1849  {
1850  mu = alpha(v);
1851  if (mu == 0.0) continue;
1852 
1853  std::size_t iv = variable[v].i;
1854  unsigned int pv = variable[v].p;
1855  unsigned int yv = example[iv].y;
1856  unsigned int r = cardP * yv + pv;
1857  std::vector<QpFloatType> q(examples);
1858  kernelMatrix.row(iv, 0, examples, &q[0]);
1859 
1860  std::size_t a, b, f;
1861  for (a=0; a<examples; a++)
1862  {
1863  double k = (q)[a];
1864  tExample& ex = example[a];
1865  typename QpSparseArray<QpFloatType>::Row const& row = M.row(classes * r + ex.y);
1866  QpFloatType def = row.defaultvalue;
1867  if (def == 0.0)
1868  {
1869  for (b=0; b<row.size; b++)
1870  {
1871  f = ex.var[row.entry[b].index];
1872  if (f >= activeVar) gradient(f) -= mu * row.entry[b].value * k;
1873  }
1874  }
1875  else
1876  {
1877  for (b=0; b<row.size; b++)
1878  {
1879  f = ex.var[row.entry[b].index];
1880  if (f >= activeVar) gradient(f) -= mu * (row.entry[b].value - def) * k;
1881  }
1882  double upd = mu * def * k;
1883  for (b=ex.active; b<cardP; b++)
1884  {
1885  f = ex.avar[b];
1886  SHARK_ASSERT(f >= activeVar);
1887  gradient(f) -= upd;
1888  }
1889  }
1890  }
1891  }
1892 
1893  for (i=0; i<examples; i++) example[i].active = cardP;
1894  activeEx = examples;
1895  activeVar = variables;
1896  //todo: mt: activate line below (new unshrink action) -> verify & test
1897  //kernelMatrix.setTruncationIndex( activeEx ); //disable cache truncation again
1898 
1899 
1900  if (! complete) shrink(epsilon);
1901  }
1902 
1903  //! true if the problem has already been unshrinked
1904  bool bUnshrinked;
1905 
1906  //! shrink a variable
1907  void deactivateVariable(std::size_t v)
1908  {
1909  std::size_t ev = variable[v].i;
1910  unsigned int iv = variable[v].index;
1911  unsigned int pv = variable[v].p;
1912  tExample* exv = &example[ev];
1913 
1914  std::size_t ih = exv->active - 1;
1915  std::size_t h = exv->avar[ih];
1916  variable[v].index = ih;
1917  variable[h].index = iv;
1918  std::swap(exv->avar[iv], exv->avar[ih]);
1919  iv = ih;
1920  exv->active--;
1921 
1922  std::size_t j = activeVar - 1;
1923  std::size_t ej = variable[j].i;
1924  unsigned int ij = variable[j].index;
1925  unsigned int pj = variable[j].p;
1926  tExample* exj = &example[ej];
1927 
1928  // exchange entries in the lists
1929  std::swap(alpha(v), alpha(j));
1930  std::swap(gradient(v), gradient(j));
1931  std::swap(linear(v), linear(j));
1932  std::swap(variable[v], variable[j]);
1933 
1934  variable[exv->avar[iv]].index = ij;
1935  variable[exj->avar[ij]].index = iv;
1936  exv->avar[iv] = j;
1937  exv->var[pv] = j;
1938  exj->avar[ij] = v;
1939  exj->var[pj] = v;
1940 
1941  activeVar--;
1942  }
1943 
1944  //! shrink an example
1945  void deactivateExample(std::size_t e)
1946  {
1947  SHARK_ASSERT(e < activeEx);
1948  std::size_t j = activeEx - 1;
1949 
1950  std::swap(example[e], example[j]);
1951 
1952  std::size_t v;
1953  std::size_t* pe = example[e].var;
1954  std::size_t* pj = example[j].var;
1955  for (v = 0; v < cardP; v++)
1956  {
1957  SHARK_ASSERT(pj[v] >= activeVar);
1958  variable[pe[v]].i = e;
1959  variable[pj[v]].i = j;
1960  }
1961 
1962  // notify the matrix cache
1963  //kernelMatrix.cacheRowRelease(e);
1964  //todo: mt: new shrinking action. test & verify, then delete line above
1965  //kernelMatrix.cacheRedeclareOldest(e);
1966  kernelMatrix.flipColumnsAndRows(e, j);
1967 
1968  activeEx--;
1969  }
1970 
1971  //! data structure describing one variable of the problem
1972  struct tVariable
1973  {
1974  std::size_t i; // index into the example list
1975  unsigned int p; // constraint corresponding to this variable
1976  unsigned int index; // index into the example's avar list
1977  double diagonal; // diagonal entry of the big Q-matrix
1978  };
1979 
1980  //! data structure describing one training example
1981  struct tExample
1982  {
1983  std::size_t index; // example index in the dataset, not the example vector!
1984  unsigned int y; // label of this example
1985  unsigned int active; // number of active variables
1986  std::size_t* var; // list of all cardP variables, in order of the p-index
1987  std::size_t* avar; // list of active variables
1988  double varsum; // sum of all variables corresponding to this example
1989  };
1990 
1991  //! information about each training example
1992  std::vector<tExample> example;
1993 
1994  //! information about each variable of the problem
1995  std::vector<tVariable> variable;
1996 
1997  //! space for the example[i].var pointers
1998  std::vector<std::size_t> storage1;
1999 
2000  //! space for the example[i].avar pointers
2001  std::vector<std::size_t> storage2;
2002 
2003  //! number of examples in the problem (size of the kernel matrix)
2004  std::size_t examples;
2005 
2006  //! number of classes in the problem
2007  unsigned int classes;
2008 
2009  //! number of dual variables per example
2010  unsigned int cardP;
2011 
2012  //! number of slack variables per example
2013  unsigned int cardR;
2014 
2015  //! number of variables in the problem = examples times cardP
2016  std::size_t variables;
2017 
2018  //! kernel matrix (precomputed matrix or matrix cache)
2019  Matrix& kernelMatrix;
2020 
2021  //! target margins
2022  RealMatrix const& gamma; // \gamma(y, c) = target margin for \mu_c(x, y)
2023 
2024  //! mapping connecting constraints (dual variables) to slack variables
2025  UIntVector const& rho;
2026 
2027  //! margin coefficients
2028  QpSparseArray<QpFloatType> const& nu; // \nu(y, c, m)
2029 
2030  //! kernel modifiers
2031  QpSparseArray<QpFloatType> const& M; // M(|P|*y_i+p, y_j, q)
2032 
2033  //! indicates whether there is a sum-to-zero constraint in the problem
2034  bool sumToZero;
2035 
2036  //! complexity constant; upper bound on all variables
2037  double C;
2038 
2039  //! linear part of the objective function
2040  RealVector linear;
2041 
2042  //! number of currently active examples
2043  std::size_t activeEx;
2044 
2045  //! number of currently active variables
2046  std::size_t activeVar;
2047 
2048  //! gradient of the objective function
2049  //! The gradient array is of fixed size and not subject to shrinking.
2050  RealVector gradient;
2051 
2052  //! solution candidate
2053  RealVector alpha;
2054 
2055  //! solution candidate
2056  RealVector* bias;
2057 
2058  //! should the solver use the shrinking heuristics?
2059  bool useShrinking;
2060 
2061  //! linear program for the bias solver
2062 // LP lp;
2063 };
2064 
2065 
2066 #undef GRADIENT_UPDATE
2067 #undef GAIN_SELECTION_BOX
2068 #undef GAIN_SELECTION_TRIANGLE
2069 
2070 
2071 }
2072 #endif