CSvmTrainer.h
//===========================================================================
/*!
 *
 * \brief Support Vector Machine Trainer for the standard C-SVM
 *
 * \par
 * This file collects trainers for the various types of support
 * vector machines. The trainers carry the hyper-parameters of
 * SVM training, which includes the kernel parameters.
 *
 *
 * \author T. Glasmachers
 * \date -
 *
 *
 * \par Copyright 1995-2016 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_CSVMTRAINER_H
#define SHARK_ALGORITHMS_CSVMTRAINER_H


// Note: the include block of the original header was lost in this listing. The code below
// needs (at least) the abstract SVM trainer base classes, the QP problems and solvers, and
// the kernel matrix types; the exact set and paths may differ between Shark versions.
#include <shark/Algorithms/Trainers/AbstractSvmTrainer.h>
#include <shark/Algorithms/QP/BoxConstrainedProblems.h>
#include <shark/Algorithms/QP/SvmProblems.h>
#include <shark/Algorithms/QP/QpBoxLinear.h>
#include <shark/Models/Kernels/GaussianRbfKernel.h>


namespace shark {


///
/// \brief Training of C-SVMs for binary classification.
///
/// The C-SVM is the "standard" support vector machine for
/// binary (two-class) classification. Given are data tuples
/// \f$ (x_i, y_i) \f$ with the x-component denoting the input and
/// the y-component denoting the label +1 or -1 (see the tutorial on
/// label conventions; the implementation uses the values 0/1),
/// a kernel function k(x, x') and a regularization
/// constant C > 0. Let H denote the reproducing kernel Hilbert
/// space induced by k, and let \f$ \phi \f$ denote the
/// corresponding feature map.
/// Then the SVM classifier is the function
/// \f[
///     h(x) = \mathop{sign} (f(x))
/// \f]
/// \f[
///     f(x) = \langle w, \phi(x) \rangle + b
/// \f]
/// with coefficients w and b given by the (primal)
/// optimization problem
/// \f[
///     \min \frac{1}{2} \|w\|^2 + C \sum_i L(y_i, f(x_i)),
/// \f]
/// where \f$ L(y, f(x)) = \max\{0, 1 - y \cdot f(x)\} \f$
/// denotes the hinge loss.
///
/// For details refer to the paper:<br/>
/// <p>Support-Vector Networks. Corinna Cortes and Vladimir Vapnik,
/// Machine Learning, vol. 20 (1995), pp. 273-297.</p>
/// or simply to the Wikipedia article:<br/>
/// http://en.wikipedia.org/wiki/Support_vector_machine
///
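/// \par Example
/// A minimal usage sketch (the kernel type, its bandwidth, the value of C and the
/// dataset name are assumptions of this example, not prescribed by the trainer):
/// \code
/// GaussianRbfKernel<RealVector> kernel(0.5);             // k(x, x') = exp(-0.5 ||x - x'||^2)
/// CSvmTrainer<RealVector> trainer(&kernel, 10.0, true);  // C = 10, train with offset term b
/// KernelClassifier<RealVector> svm;
/// trainer.train(svm, trainingData);                      // trainingData: LabeledData<RealVector, unsigned int>
/// \endcode
///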
template <class InputType, class CacheType = float>
class CSvmTrainer : public AbstractSvmTrainer<
	InputType, unsigned int,
	KernelClassifier<InputType>,
	AbstractWeightedTrainer<KernelClassifier<InputType> >
>
{
private:
	typedef AbstractSvmTrainer<
		InputType, unsigned int,
		KernelClassifier<InputType>,
		AbstractWeightedTrainer<KernelClassifier<InputType> >
	> base_type;
public:

	/// \brief Convenience typedefs:
	/// this and many of the typedefs below build on the class template type CacheType.
	/// Simply changing that one template parameter CacheType thus makes it possible to
	/// switch flexibly between float and double as the type for caching the kernel values.
	/// The default is float, offering sufficient accuracy in the vast majority
	/// of cases at a memory cost of only four bytes. However, the template
	/// parameter makes it easy to use double instead (e.g., in case high
	/// accuracy training is needed).
	typedef CacheType QpFloatType;
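	// For example, a trainer caching kernel values in double precision can be obtained by
	// changing only this template argument (the alias name below is illustrative):
	//   typedef CSvmTrainer<RealVector, double> DoubleCacheCSvmTrainer;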

	typedef AbstractKernelFunction<InputType> KernelType;

	//! Constructor
	//! \param kernel kernel function to use for training and prediction
	//! \param C regularization parameter - always the 'true' value of C, even when unconstrained is set
	//! \param offset whether to train the SVM with an offset term
	//! \param unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
	//! \param computeDerivative should the derivative of b with respect to C be computed?
	CSvmTrainer(KernelType* kernel, double C, bool offset, bool unconstrained = false, bool computeDerivative = true)
	: base_type(kernel, C, offset, unconstrained), m_computeDerivative(computeDerivative)
	{ }

	//! Constructor
	//! \param kernel kernel function to use for training and prediction
	//! \param negativeC regularization parameter of the negative class (label 0)
	//! \param positiveC regularization parameter of the positive class (label 1)
	//! \param offset whether to train the SVM with an offset term
	//! \param unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
	CSvmTrainer(KernelType* kernel, double negativeC, double positiveC, bool offset, bool unconstrained = false)
	: base_type(kernel, negativeC, positiveC, offset, unconstrained), m_computeDerivative(false)
	{ }

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "CSvmTrainer"; }

	/// For the rare case that there are only bounded SVs and no free SVs, this gives access to the
	/// derivative of b w.r.t. the hyperparameters for external use. The derivative w.r.t. C is last.
	RealVector const& get_db_dParams() {
		return m_db_dParams;
	}


	/// \brief Train the C-SVM.
	void train(KernelClassifier<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset)
	{
		//prepare the model
		svm.decisionFunction().setStructure(base_type::m_kernel, dataset.inputs(), this->m_trainOffset);

		//dispatch to the optimal implementation and solve the problem
		trainInternal(svm.decisionFunction(), dataset);

		if (base_type::sparsify())
			svm.decisionFunction().sparsify();
	}

	/// \brief Train the C-SVM using weights.
	void train(KernelClassifier<InputType>& svm, WeightedLabeledData<InputType, unsigned int> const& dataset)
	{
		//prepare the model
		svm.decisionFunction().setStructure(base_type::m_kernel, dataset.inputs(), this->m_trainOffset);

		//dispatch to the optimal implementation and solve the problem
		trainInternal(svm.decisionFunction(), dataset);

		if (base_type::sparsify())
			svm.decisionFunction().sparsify();
	}

private:

	//by default the normal unoptimized kernel matrix is used
	template<class T, class DatasetTypeT>
	void trainInternal(KernelExpansion<T>& svm, DatasetTypeT const& dataset){
		KernelMatrix<T, QpFloatType> km(*base_type::m_kernel, dataset.inputs());
		trainInternal(km, svm, dataset);
	}

	//in the case of a Gaussian kernel and sparse vectors, we can use an optimized approach
	template<class T, class DatasetTypeT>
	void trainInternal(KernelExpansion<CompressedRealVector>& svm, DatasetTypeT const& dataset){
		//check whether a Gaussian kernel is used
		typedef GaussianRbfKernel<CompressedRealVector> Gaussian;
		Gaussian const* kernel = dynamic_cast<Gaussian const*>(base_type::m_kernel);
		if(kernel != 0){ //yes: use the optimized kernel matrix
			GaussianKernelMatrix<CompressedRealVector, QpFloatType> km(kernel->gamma(), dataset.inputs());
			trainInternal(km, svm, dataset);
		}
		else{ //no: fall back to the generic kernel matrix
			KernelMatrix<CompressedRealVector, QpFloatType> km(*base_type::m_kernel, dataset.inputs());
			trainInternal(km, svm, dataset);
		}
	}

	//create the problem for the unweighted datasets
	template<class Matrix, class T>
	void trainInternal(Matrix& km, KernelExpansion<T>& svm, LabeledData<T, unsigned int> const& dataset){
		if (QpConfig::precomputeKernel())
		{
			PrecomputedMatrix<Matrix> matrix(&km);
			CSVMProblem<PrecomputedMatrix<Matrix> > svmProblem(matrix, dataset.labels(), base_type::m_regularizers);
			optimize(svm, svmProblem, dataset);
		}
		else
		{
			CachedMatrix<Matrix> matrix(&km);
			CSVMProblem<CachedMatrix<Matrix> > svmProblem(matrix, dataset.labels(), base_type::m_regularizers);
			optimize(svm, svmProblem, dataset);
		}
		base_type::m_accessCount = km.getAccessCount();
	}

	// create the problem for the weighted datasets
	template<class Matrix, class T>
	void trainInternal(Matrix& km, KernelExpansion<T>& svm, WeightedLabeledData<T, unsigned int> const& dataset){
		if (QpConfig::precomputeKernel())
		{
			PrecomputedMatrix<Matrix> matrix(&km);
			GeneralQuadraticProblem<PrecomputedMatrix<Matrix> > svmProblem(
				matrix, dataset.labels(), dataset.weights(), base_type::m_regularizers
			);
			optimize(svm, svmProblem, dataset.data());
		}
		else
		{
			CachedMatrix<Matrix> matrix(&km);
			GeneralQuadraticProblem<CachedMatrix<Matrix> > svmProblem(
				matrix, dataset.labels(), dataset.weights(), base_type::m_regularizers
			);
			optimize(svm, svmProblem, dataset.data());
		}
		base_type::m_accessCount = km.getAccessCount();
	}

private:

	template<class SVMProblemType>
	void optimize(KernelExpansion<InputType>& svm, SVMProblemType& svmProblem, LabeledData<InputType, unsigned int> const& dataset){
		if (this->m_trainOffset)
		{
			typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
			ProblemType problem(svmProblem, base_type::m_shrinking);
			QpSolver< ProblemType > solver(problem);
			solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
			column(svm.alpha(),0) = problem.getUnpermutedAlpha();
			svm.offset(0) = computeBias(problem, dataset);
		}
		else
		{
			typedef BoxConstrainedShrinkingProblem<SVMProblemType> ProblemType;
			ProblemType problem(svmProblem, base_type::m_shrinking);
			QpSolver< ProblemType > solver(problem);
			solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
			column(svm.alpha(),0) = problem.getUnpermutedAlpha();
		}
	}

	RealVector m_db_dParams; ///< in the rare case that there are only bounded SVs and no free SVs, this will hold the derivative of b w.r.t. the hyperparameters. The derivative w.r.t. C is last.

	bool m_computeDerivative;

	template<class Problem>
	double computeBias(Problem const& problem, LabeledData<InputType, unsigned int> const& dataset){
		std::size_t nkp = base_type::m_kernel->numberOfParameters();
		m_db_dParams.resize(nkp+1);
		m_db_dParams.clear();

		std::size_t ic = problem.dimensions();

		// compute the offset from the KKT conditions
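		// For every free variable (boxMin(i) < alpha(i) < boxMax(i)) the KKT conditions require
		// g_i = y_i - \sum_j \alpha_j k(x_j, x_i) = b, so averaging the gradients of all free
		// variables gives a numerically stabilized value of b. Without free variables, b is only
		// bracketed between the largest gradient at the lower bound and the smallest gradient at
		// the upper bound; the midpoint of that interval is then used as the estimate.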
		double lowerBound = -1e100;
		double upperBound = 1e100;
		double sum = 0.0;
		std::size_t freeVars = 0;
		std::size_t lower_i = 0;
		std::size_t upper_i = 0;
		for (std::size_t i=0; i<ic; i++)
		{
			double value = problem.gradient(i);
			if (problem.alpha(i) == problem.boxMin(i))
			{
				if (value > lowerBound) { //in case of no free SVs, we are looking for the largest gradient of all alphas at the lower bound
					lowerBound = value;
					lower_i = i;
				}
			}
			else if (problem.alpha(i) == problem.boxMax(i))
			{
				if (value < upperBound) { //in case of no free SVs, we are looking for the smallest gradient of all alphas at the upper bound
					upperBound = value;
					upper_i = i;
				}
			}
			else
			{
				sum += value;
				freeVars++;
			}
		}
		if (freeVars > 0)
			return sum / freeVars; //stabilized (averaged) exact value

		if(!m_computeDerivative)
			return 0.5 * (lowerBound + upperBound); //best estimate

		lower_i = problem.permutation(lower_i);
		upper_i = problem.permutation(upper_i);

		SHARK_CHECK(base_type::m_regularizers.size() == 1, "derivative only implemented for SVM with one C" );

		// We next compute the derivative of lowerBound and upperBound w.r.t. C, in order to then get that of b w.r.t. C.
		// The equation at the foundation of this simply is g_i = y_i - \sum_j \alpha_j K_{ij} .
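		// Note: this point is only reached when there are no free SVs, so every support vector sits
		// at a bound, i.e. \alpha_j = y_j C and d\alpha_j/dC = y_j. Hence dg_i/dC = -\sum_j y_j K_{ij};
		// the loop below accumulates \sum_j y_j K_{ij} for the lower and upper index (and the analogous
		// kernel parameter derivatives), and the final assignments apply the minus sign together with
		// the factor 0.5 stemming from the estimate b = (lowerBound + upperBound) / 2.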
		double dlower_dC = 0.0;
		double dupper_dC = 0.0;
		// At the same time, we also compute the derivative of lowerBound and upperBound w.r.t. the kernel parameters.
		// The equation at the foundation of this simply is g_i = y_i - \sum_j \alpha_j K_{ij} .
		RealVector dupper_dkernel( nkp, 0 );
		RealVector dlower_dkernel( nkp, 0 );
		//state for eval and evalDerivative of the kernel
		boost::shared_ptr<State> kernelState = base_type::m_kernel->createState();
		RealVector der(nkp); //derivative storage helper
		//todo: O.K.: a kernel single-input derivative would be useful here.
		//It could also be useful to use real batch processing, with batches of size 1 for lower/upper
		//and whole batches instead of singleInput.
		//What we do instead is use the batched input versions with batches of size one.
		typename Batch<InputType>::type singleInput = Batch<InputType>::createBatch( dataset.element(0).input, 1 );
		typename Batch<InputType>::type lowerInput = Batch<InputType>::createBatch( dataset.element(lower_i).input, 1 );
		typename Batch<InputType>::type upperInput = Batch<InputType>::createBatch( dataset.element(upper_i).input, 1 );
		get( lowerInput, 0 ) = dataset.element(lower_i).input; //copy the current input into the batch
		get( upperInput, 0 ) = dataset.element(upper_i).input; //copy the current input into the batch
		RealMatrix one(1,1,1); //weight of the input
		RealMatrix result(1,1); //stores the result of the call

		for (std::size_t i=0; i<ic; i++) {
			double cur_alpha = problem.alpha(problem.permutation(i));
			if ( cur_alpha != 0 ) {
				int cur_label = ( cur_alpha > 0.0 ? 1 : -1 );
				get( singleInput, 0 ) = dataset.element(i).input; //copy the current input into the batch
				// treat contributions of the largest gradient at the lower bound
				base_type::m_kernel->eval( lowerInput, singleInput, result, *kernelState );
				dlower_dC += cur_label * result(0,0);
				base_type::m_kernel->weightedParameterDerivative( lowerInput, singleInput, one, *kernelState, der );
				for ( std::size_t k=0; k<nkp; k++ ) {
					dlower_dkernel(k) += cur_label * der(k);
				}
				// treat contributions of the smallest gradient at the upper bound
				base_type::m_kernel->eval( upperInput, singleInput, result, *kernelState );
				dupper_dC += cur_label * result(0,0);
				base_type::m_kernel->weightedParameterDerivative( upperInput, singleInput, one, *kernelState, der );
				for ( std::size_t k=0; k<nkp; k++ ) {
					dupper_dkernel(k) += cur_label * der(k);
				}
			}
		}
		// assign final values to the derivative of b w.r.t. the hyperparameters
		m_db_dParams( nkp ) = -0.5 * ( dlower_dC + dupper_dC );
		for ( std::size_t k=0; k<nkp; k++ ) {
			m_db_dParams(k) = -0.5 * this->C() * ( dlower_dkernel(k) + dupper_dkernel(k) );
		}
		if ( base_type::m_unconstrained ) {
			m_db_dParams( nkp ) *= this->C();
		}

		return 0.5 * (lowerBound + upperBound); //best estimate
	}
};


///
/// \brief Training of linear C-SVMs for binary classification.
///
template <class InputType>
class LinearCSvmTrainer : public AbstractLinearSvmTrainer<InputType>
{
public:
	typedef AbstractLinearSvmTrainer<InputType> base_type;

	LinearCSvmTrainer(double C, bool unconstrained = false)
	: AbstractLinearSvmTrainer<InputType>(C, unconstrained){}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "LinearCSvmTrainer"; }

	void train(LinearClassifier<InputType>& model, LabeledData<InputType, unsigned int> const& dataset)
	{
		std::size_t dim = inputDimension(dataset);
		QpBoxLinear<InputType> solver(dataset, dim);
		RealMatrix w(1, dim, 0.0);
		row(w, 0) = solver.solve(
			base_type::C(),
			0.0,
			QpConfig::stoppingCondition(),
			&QpConfig::solutionProperties(),
			QpConfig::verbosity() > 0);
		model.decisionFunction().setStructure(w);
	}
};


///
/// \brief Training of C-SVMs with squared hinge loss for binary classification.
///
template <class InputType, class CacheType = float>
class SquaredHingeCSvmTrainer : public AbstractSvmTrainer<InputType, unsigned int>
{
public:
	typedef CacheType QpFloatType;

	typedef RegularizedKernelMatrix<InputType, QpFloatType> KernelMatrixType;
	typedef CachedMatrix< KernelMatrixType > CachedMatrixType;
	typedef PrecomputedMatrix< KernelMatrixType > PrecomputedMatrixType;

	typedef AbstractKernelFunction<InputType> KernelType;
	typedef AbstractSvmTrainer<InputType, unsigned int> base_type;

	//! Constructor
	//! \param kernel kernel function to use for training and prediction
	//! \param C regularization parameter - always the 'true' value of C, even when unconstrained is set
	//! \param unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
	SquaredHingeCSvmTrainer(KernelType* kernel, double C, bool unconstrained = false)
	: base_type(kernel, C, unconstrained)
	{ }

	//! Constructor
	//! \param kernel kernel function to use for training and prediction
	//! \param negativeC regularization parameter of the negative class (label 0)
	//! \param positiveC regularization parameter of the positive class (label 1)
	//! \param unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
	SquaredHingeCSvmTrainer(KernelType* kernel, double negativeC, double positiveC, bool unconstrained = false)
	: base_type(kernel, negativeC, positiveC, unconstrained)
	{ }
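
	// Usage sketch (kernel choice, C and dataset name are illustrative assumptions):
	//   GaussianRbfKernel<RealVector> kernel(0.5);
	//   SquaredHingeCSvmTrainer<RealVector> trainer(&kernel, 1.0);
	//   KernelClassifier<RealVector> svm;
	//   trainer.train(svm, trainingData);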

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "SquaredHingeCSvmTrainer"; }

	/// \brief Train the C-SVM.
	void train(KernelClassifier<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset)
	{
		svm.decisionFunction().setStructure(base_type::m_kernel, dataset.inputs(), this->m_trainOffset);

		RealVector diagonalModifier(dataset.numberOfElements(), 0.5/base_type::m_regularizers(0));
		if(base_type::m_regularizers.size() != 1){
			for(std::size_t i = 0; i != diagonalModifier.size(); ++i){
				diagonalModifier(i) = 0.5/base_type::m_regularizers(dataset.element(i).label);
			}
		}

		KernelMatrixType km(*base_type::m_kernel, dataset.inputs(), diagonalModifier);
		if (QpConfig::precomputeKernel())
		{
			PrecomputedMatrixType matrix(&km);
			optimize(svm.decisionFunction(), matrix, diagonalModifier, dataset);
		}
		else
		{
			CachedMatrixType matrix(&km);
			optimize(svm.decisionFunction(), matrix, diagonalModifier, dataset);
		}
		base_type::m_accessCount = km.getAccessCount();
		if (base_type::sparsify()) svm.decisionFunction().sparsify();
	}

private:

	template<class Matrix>
	void optimize(KernelExpansion<InputType>& svm, Matrix& matrix, RealVector const& diagonalModifier, LabeledData<InputType, unsigned int> const& dataset){
		typedef CSVMProblem<Matrix> SVMProblemType;
		SVMProblemType svmProblem(matrix, dataset.labels(), 1e100);
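		// The upper bound of 1e100 effectively removes the box constraint: for the squared hinge
		// loss the regularization acts through the kernel diagonal instead, since diagonalModifier
		// adds 1/(2C_i) to each diagonal entry of the (regularized) kernel matrix. This is also why
		// the bias computation below subtracts problem.alpha(i)*2*diagonalModifier(i) from the
		// gradient: it recovers the gradient of the unmodified kernel expansion.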
		if (this->m_trainOffset)
		{
			typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
			ProblemType problem(svmProblem, base_type::m_shrinking);
			QpSolver< ProblemType > solver(problem);
			solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
			column(svm.alpha(),0) = problem.getUnpermutedAlpha();
			//compute the bias
			double sum = 0.0;
			std::size_t freeVars = 0;
			for (std::size_t i=0; i < problem.dimensions(); i++)
			{
				if(problem.alpha(i) > problem.boxMin(i) && problem.alpha(i) < problem.boxMax(i)){
					sum += problem.gradient(i) - problem.alpha(i)*2*diagonalModifier(i);
					freeVars++;
				}
			}
			if (freeVars > 0)
				svm.offset(0) = sum / freeVars; //stabilized (averaged) exact value
			else
				svm.offset(0) = 0;
		}
		else
		{
			typedef BoxConstrainedShrinkingProblem<SVMProblemType> ProblemType;
			ProblemType problem(svmProblem, base_type::m_shrinking);
			QpSolver< ProblemType > solver(problem);
			solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
			column(svm.alpha(),0) = problem.getUnpermutedAlpha();
		}
	}
};


///
/// \brief Training of linear C-SVMs with squared hinge loss for binary classification.
///
template <class InputType>
class SquaredHingeLinearCSvmTrainer : public AbstractLinearSvmTrainer<InputType>
{
public:
	typedef AbstractLinearSvmTrainer<InputType> base_type;

	SquaredHingeLinearCSvmTrainer(double C, bool unconstrained = false)
	: AbstractLinearSvmTrainer<InputType>(C, unconstrained){}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "SquaredHingeLinearCSvmTrainer"; }

	void train(LinearClassifier<InputType>& model, LabeledData<InputType, unsigned int> const& dataset)
	{
		std::size_t dim = inputDimension(dataset);
		QpBoxLinear<InputType> solver(dataset, dim);
		RealMatrix w(1, dim, 0.0);
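		// Mirrors the kernelized squared-hinge trainer above: the box constraint is effectively
		// removed (first argument 1e100) and the regularization is instead handed to the solver
		// through its second argument (1/C here, versus 0 for the plain hinge-loss LinearCSvmTrainer).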
		row(w, 0) = solver.solve(
			1e100,
			1.0 / base_type::C(),
			QpConfig::stoppingCondition(),
			&QpConfig::solutionProperties(),
			QpConfig::verbosity() > 0);
		model.decisionFunction().setStructure(w);
	}
};


}
#endif