CSvmDerivative.h
//===========================================================================
/*!
 *
 * \brief Derivative of a C-SVM hypothesis w.r.t. its hyperparameters.
 *
 * \par
 * This class provides two main member functions for computing the
 * derivative of a C-SVM hypothesis w.r.t. its hyperparameters. First,
 * the derivative is prepared in general. Then, the derivative can be
 * computed comparatively cheaply for any input sample. It needs to be
 * supplied with pointers to a KernelExpansion and a CSvmTrainer.
 *
 * \author M. Tuma, T. Glasmachers
 * \date 2007-2012
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_MODELS_CSVMDERIVATIVE_H
#define SHARK_MODELS_CSVMDERIVATIVE_H

#include <shark/Core/INameable.h>
#include <shark/Core/ISerializable.h>
#include <shark/Models/Kernels/KernelExpansion.h>
#include <shark/Algorithms/Trainers/CSvmTrainer.h>

namespace shark {

/// \brief Derivative of a C-SVM hypothesis w.r.t. its hyperparameters.
///
/// This class provides two main member functions for computing the
/// derivative of a C-SVM hypothesis w.r.t. its hyperparameters.
/// The constructor takes a pointer to a KernelClassifier and an SvmTrainer,
/// under the assumption that the former was trained by the latter. It heavily
/// accesses their members to calculate the derivative of the alpha and offset
/// values w.r.t. the SVM hyperparameters, that is, the regularization
/// parameter C and the kernel parameters. This is done in the member function
/// prepareCSvmParameterDerivative, which is called by the constructor. After this
/// initial, heavier computation step, modelCSvmParameterDerivative can be called
/// on any input sample to the SVM model, and the method will yield the derivative
/// of the hypothesis w.r.t. the SVM hyperparameters at that sample.
///
/// \tparam InputType Same basis type as the kernel expansion's
/// \tparam CacheType While the basic cache type defaults to float in the QP algorithms,
/// it defaults to double here, because the SVM derivative benefits a lot from higher precision.
///
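/// \par Example
/// A minimal usage sketch (illustrative only: it assumes a Gaussian RBF kernel and an
/// already available training set `trainingData`; the SVM must be trained with offset,
/// and the trainer's cache type must match this class's CacheType, double by default):
/// \code
/// GaussianRbfKernel<> kernel( 0.5 );
/// KernelClassifier<RealVector> svm;
/// CSvmTrainer<RealVector, double> trainer( &kernel, 1.0, true ); // C = 1.0, with offset b
/// trainer.train( svm, trainingData );
///
/// CSvmDerivative<RealVector> svmDeriv( &svm, &trainer );
/// RealVector derivative;
/// svmDeriv.modelCSvmParameterDerivative( input, derivative );
/// // derivative(0),...,derivative(nkp-1): w.r.t. the kernel parameters;
/// // derivative(nkp): w.r.t. the regularization parameter C
/// \endcode
///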
template< class InputType, class CacheType = double >
class CSvmDerivative : public ISerializable, public INameable
{
public:
    typedef CacheType QpFloatType;
    typedef KernelClassifier<InputType> KeType;
    typedef AbstractKernelFunction<InputType> KernelType;
    typedef CSvmTrainer<InputType, QpFloatType> TrainerType;

protected:

    // key external members through which main information is obtained
    KernelExpansion<InputType>* mep_ke; ///< pointer to the KernelExpansion which has to have been trained by the below SvmTrainer
    TrainerType* mep_tr; ///< pointer to the SvmTrainer with which the above KernelExpansion has to have been trained
    KernelType* mep_k; ///< convenience pointer to the underlying kernel function
    RealMatrix& m_alpha; ///< convenience reference to the alpha values of the KernelExpansion
    const Data<InputType>& m_basis; ///< convenience reference to the underlying data of the KernelExpansion
    const RealVector& m_db_dParams_from_solver; ///< convenience access to the correction term from the solver, for the rare case that there are no free SVs

    // convenience copies from the CSvmTrainer and the underlying kernel function
    double m_C; ///< the regularization parameter value with which the SvmTrainer trained the KernelExpansion
    bool m_unconstrained; ///< is the unconstrained flag of the SvmTrainer set? Influences the derivatives!
    std::size_t m_nkp; ///< convenience member holding the number of kernel parameters
    std::size_t m_nhp; ///< convenience member holding the number of hyperparameters (kernel parameters plus C)

    // information calculated from the KernelExpansion state in the prepareDerivative-step
    std::size_t m_noofFreeSVs; ///< number of free SVs
    std::size_t m_noofBoundedSVs; ///< number of bounded SVs
    std::vector< std::size_t > m_freeAlphaIndices; ///< indices of the free SVs
    std::vector< std::size_t > m_boundedAlphaIndices; ///< indices of the bounded SVs
    RealVector m_freeAlphas; ///< alpha values of the free SVs (plus the offset b as last entry)
    RealVector m_boundedAlphas; ///< alpha values of the bounded SVs
    RealVector m_boundedLabels; ///< labels of the bounded SVs

    /// Main member and result, computed in the prepareDerivative-step:
    /// Stores the derivative of the **free** alphas and the offset b w.r.t. the SVM
    /// hyperparameters, as obtained through the CSvmTrainer (for C) and through the
    /// kernel (for the kernel parameters). Each row corresponds to one **free** alpha
    /// (the last row to the offset b), each column to one hyperparameter. The **last**
    /// column is the derivative of (free_alphas, b) w.r.t. C; all **previous** columns
    /// are w.r.t. the kernel parameters.
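    /// For illustration, with nkp kernel parameters, row i < #freeSVs holds
    /// ( d\alpha_i/d\theta_1, ..., d\alpha_i/d\theta_nkp, d\alpha_i/dC ),
    /// and the last row holds the corresponding partial derivatives of the offset b.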
    RealMatrix m_d_alphab_d_theta;

public:

    /// Constructor. Only sets up the main pointers and references to the external
    /// instances and data, and performs basic sanity checks.
    /// \param ke pointer to the KernelClassifier whose decision function has to have been trained by the below SvmTrainer
    /// \param trainer pointer to the SvmTrainer with which the above classifier has to have been trained
    CSvmDerivative( KeType* ke, TrainerType* trainer )
    : mep_ke( &ke->decisionFunction() ),
      mep_tr( trainer ),
      mep_k( trainer->kernel() ),
      m_alpha( mep_ke->alpha() ),
      m_basis( mep_ke->basis() ),
      m_db_dParams_from_solver( trainer->get_db_dParams() ),
      m_C ( trainer->C() ),
      m_unconstrained( trainer->isUnconstrained() ),
      m_nkp( trainer->kernel()->numberOfParameters() ),
      m_nhp( trainer->kernel()->numberOfParameters()+1 )
    {
        SHARK_CHECK( mep_ke != NULL, "[CSvmDerivative::CSvmDerivative] KernelExpansion cannot be NULL.");
        SHARK_CHECK( mep_ke->kernel() == trainer->kernel(), "[CSvmDerivative::CSvmDerivative] KernelExpansion and SvmTrainer must use the same KernelFunction.");
        SHARK_CHECK( mep_ke->outputSize() == 1, "[CSvmDerivative::CSvmDerivative] only defined for binary SVMs.");
        SHARK_CHECK( mep_ke->hasOffset(), "[CSvmDerivative::CSvmDerivative] only defined for SVMs with offset.");
        SHARK_CHECK( m_alpha.size2() == 1, "[CSvmDerivative::CSvmDerivative] this class is only defined for binary SVMs.");
        prepareCSvmParameterDerivative(); //main preparation step
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "CSvmDerivative"; }

    inline const KernelExpansion<InputType>* ke() { return mep_ke; }
    inline const TrainerType* trainer() { return mep_tr; }

    //! Computes the derivative of the model w.r.t. the regularization and kernel parameters.
    //! The preparation step (prepareCSvmParameterDerivative) is carried out by the constructor;
    //! if the SVM is retrained afterwards, construct a new CSvmDerivative before calling this function.
    //! \param input an example to be scored by the SVM
    //! \param derivative the resulting derivative of the score; the last entry is w.r.t. C.
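    //! In formula form, the loops below compute (with k(.,.) the kernel):
    //!   d(score)/d\theta = \sum_{i free} [ k(input,x_i) * d\alpha_i/d\theta + \alpha_i * dk(input,x_i)/d\theta ]
    //!                    + \sum_{j bounded} [ k(input,x_j) * d\alpha_j/d\theta + \alpha_j * dk(input,x_j)/d\theta ]
    //!                    + db/d\theta
    //! where for bounded SVs d\alpha_j/dC = y_j, and d\alpha_j/d\theta_k = 0 for kernel parameters.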
    void modelCSvmParameterDerivative(const InputType& input, RealVector& derivative )
    {
        // create temporary batch helpers
        RealIdentityMatrix unit_weights(1);
        RealMatrix bof_results(1,1);
        typename Batch<InputType>::type bof_xi = Batch<InputType>::createBatch(input,1);
        typename Batch<InputType>::type bof_input = Batch<InputType>::createBatch(input,1);
        get(bof_input, 0) = input; //fixed over entire function scope

        // init helpers
        RealVector der( m_nkp ); //derivative storage helper, one entry per kernel parameter
        boost::shared_ptr<State> state = mep_k->createState(); //state from eval and for derivatives
        derivative.resize( m_nhp );

        // start calculating the derivative:
        // the offset b contributes to every score, so initialize with db/d(\theta)
        noalias(derivative) = row(m_d_alphab_d_theta,m_noofFreeSVs);
        // first: go through the free SVs and add their contributions (the actual ones, which use the matrix d_alphab_d_theta)
        for ( std::size_t i=0; i<m_noofFreeSVs; i++ ) {
            get(bof_xi, 0) = m_basis.element(m_freeAlphaIndices[i]);
            mep_k->eval( bof_input, bof_xi, bof_results, *state );
            double ker = bof_results(0,0);
            double cur_alpha = m_freeAlphas(i);
            mep_k->weightedParameterDerivative( bof_input, bof_xi, unit_weights, *state, der );
            noalias(derivative) += ker * row(m_d_alphab_d_theta,i); //product rule, first part: d\alpha/d\theta times the kernel value
            noalias(subrange(derivative,0,m_nkp)) += cur_alpha*der; //product rule, second part: \alpha times the kernel derivative
        }
        // second: go through all bounded SVs and add their "trivial" derivative contributions
        for ( std::size_t i=0; i<m_noofBoundedSVs; i++ ) {
            get(bof_xi, 0) = m_basis.element(m_boundedAlphaIndices[i]);
            mep_k->eval( bof_input, bof_xi, bof_results, *state );
            double ker = bof_results(0,0);
            double cur_label = m_boundedLabels(i);
            mep_k->weightedParameterDerivative( bof_input, bof_xi, unit_weights, *state, der );
            derivative( m_nkp ) += ker * cur_label; //deriv of a bounded alpha w.r.t. C is simply the label
            noalias(subrange(derivative,0,m_nkp)) += cur_label * m_C * der;
        }
        if ( m_unconstrained )
            derivative( m_nkp ) *= m_C; //compensate for the log encoding of C via the chain rule
        //(the kernel parameter derivatives are correctly differentiated according to their
        // respective encoding via the kernel's derivative, so we don't need to correct for those.)

        // in some rare cases there are no free SVs, and we have to manually correct the
        // derivatives using a correction term from the SvmTrainer.
        if ( m_noofFreeSVs == 0 ) {
            noalias(derivative) += m_db_dParams_from_solver;
        }
    }

    //! Whether there are free SVs in the solution. Useful to monitor for degenerate solutions
    //! with only bounded and no free SVs. The counts are determined during construction.
    bool hasFreeSVs() { return ( m_noofFreeSVs != 0 ); }
    //! Whether there are bounded SVs in the solution. Useful to monitor for degenerate solutions
    //! with only bounded and no free SVs. The counts are determined during construction.
    bool hasBoundedSVs() { return ( m_noofBoundedSVs != 0 ); }

    /// Access to the matrix of the SVM coefficients' derivatives. The derivative w.r.t. C is in the last column.
    const RealMatrix& get_dalphab_dtheta() {
        return m_d_alphab_d_theta;
    }

    //\todo //mtq
    /// From ISerializable, reads this object from an archive
    virtual void read( InArchive & archive ) {
        throw SHARKEXCEPTION("[CSvmDerivative::read] Not implemented yet.");
    }
    //\todo //mtq
    /// From ISerializable, writes this object to an archive
    virtual void write( OutArchive & archive ) const {
        throw SHARKEXCEPTION("[CSvmDerivative::write] Not implemented yet.");
    }

private:

    /////////// DERIVATIVE OF BINARY (C-)SVM ////////////////////

    //! Fill m_d_alphab_d_theta, the matrix of derivatives of the free SVs' alphas (and the
    //! offset b) w.r.t. the C-SVM hyperparameters, as obtained through the CSvmTrainer
    //! (for C) and through the kernel (for the kernel parameters).
    //! \par
    //! Note: we follow the alpha-encoding conventions of Glasmachers' dissertation, where
    //! the alpha values are re-encoded by multiplying each with the corresponding label.
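    //! For example, under this encoding a sample with label y_i = -1 whose alpha value sits
    //! at the upper bound is stored as alpha_i = -C; hence the bounded-SV test below compares
    //! against both +C and -C, and the label can be recovered as the sign of alpha.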
    void prepareCSvmParameterDerivative() {
        // init convenience size indicators
        std::size_t numberOfAlphas = m_alpha.size1();

        // first round through the alphas: count free and bounded SVs
        for ( std::size_t i=0; i<numberOfAlphas; i++ ) {
            double cur_alpha = m_alpha(i,0); //we assume (and checked) that there is only one output column
            if ( cur_alpha != 0.0 ) {
                if ( cur_alpha == m_C || cur_alpha == -m_C ) { //the SVM formulation using re-encoded alphas is assumed
                    m_boundedAlphaIndices.push_back(i);
                } else {
                    m_freeAlphaIndices.push_back(i);
                }
            }
        }
        m_noofFreeSVs = m_freeAlphaIndices.size(); //don't forget to add b to the count where appropriate
        m_noofBoundedSVs = m_boundedAlphaIndices.size();
        // in contrast to the Shark2 implementation, we don't store useless constants here
        // (i.e., 0, 1, -1), but only the derivatives w.r.t. (\alpha_free, b)
        m_d_alphab_d_theta.resize(m_noofFreeSVs+1, m_nhp);
        m_d_alphab_d_theta.clear();
        m_freeAlphaIndices.push_back( numberOfAlphas ); //b is always free (but don't forget to add it to the count manually)

        // 2nd round through the alphas: build up the RealVectors of free and bounded alphas (needed for matrix-vector products later)
        m_freeAlphas.resize( m_noofFreeSVs+1 );
        m_boundedAlphas.resize( m_noofBoundedSVs );
        m_boundedLabels.resize( m_noofBoundedSVs );
        for ( std::size_t i=0; i<m_noofFreeSVs; i++ )
            m_freeAlphas(i) = m_alpha( m_freeAlphaIndices[i], 0 );
        m_freeAlphas( m_noofFreeSVs ) = mep_ke->offset(0);
        for ( std::size_t i=0; i<m_noofBoundedSVs; i++ ) {
            double cur_alpha = m_alpha( m_boundedAlphaIndices[i], 0 );
            m_boundedAlphas(i) = cur_alpha;
            m_boundedLabels(i) = ( (cur_alpha > 0.0) ? 1.0 : -1.0 );
        }

        // if there are no free support vectors, we are done
        if ( m_noofFreeSVs == 0 ) {
            return;
        }

        // set up helper variables.
        // See Tobias Glasmachers' dissertation, chapter 9.3, for a derivation of the following
        // computations as well as for a definition of these variables. -> It's very easy to
        // follow this code with that chapter open. The Keerthi et al. paper "An efficient method
        // for gradient-based adaptation of hyperparameters in SVM models" is also highly
        // recommended for cross-reference.
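        // In matrix form, the remainder of this function computes
        //   d(\alpha_f, b)/dC        = -H^{-1} (R y_b)
        //   d(\alpha_f, b)/d\theta_k = -H^{-1} (dH_k \alpha_f + dR_k \alpha_b)
        // where H is the kernel matrix of the free SVs bordered by ones (for b), R the
        // free-vs-bounded kernel block, y_b the labels of the bounded SVs, and \theta_k
        // the k-th kernel parameter.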
        RealVector der( m_nkp ); //derivative storage helper
        boost::shared_ptr<State> state = mep_k->createState(); //state object for derivatives

        // create temporary batch helpers; free SVs are guaranteed to exist at this point
        // (see the early return above), so the first free SV yields batches of the right structure
        RealIdentityMatrix unit_weights(1);
        RealMatrix bof_results(1,1);
        typename Batch<InputType>::type bof_xi = Batch<InputType>::createBatch( m_basis.element(m_freeAlphaIndices[0]), 1 ); //any input works
        typename Batch<InputType>::type bof_xj = Batch<InputType>::createBatch( m_basis.element(m_freeAlphaIndices[0]), 1 ); //any input works

        // initialize H and dH
        RealMatrix H( m_noofFreeSVs+1, m_noofFreeSVs+1, 0.0 );
        std::vector< RealMatrix > dH( m_nkp, RealMatrix(m_noofFreeSVs+1, m_noofFreeSVs+1) );
        for ( std::size_t i=0; i<m_noofFreeSVs; i++ ) {
            get(bof_xi, 0) = m_basis.element(m_freeAlphaIndices[i]); //fixed over the outer loop
            // fill the off-diagonal entries..
            for ( std::size_t j=0; j<i; j++ ) {
                get(bof_xj, 0) = m_basis.element(m_freeAlphaIndices[j]); //get the second sample into a batch
                mep_k->eval( bof_xi, bof_xj, bof_results, *state );
                H( i,j ) = H( j,i ) = bof_results(0,0);
                mep_k->weightedParameterDerivative( bof_xi, bof_xj, unit_weights, *state, der );
                for ( std::size_t k=0; k<m_nkp; k++ ) {
                    dH[k]( i,j ) = dH[k]( j,i ) = der(k);
                }
            }
            // ..then fill the diagonal entries..
            mep_k->eval( bof_xi, bof_xi, bof_results, *state );
            H( i,i ) = bof_results(0,0);
            mep_k->weightedParameterDerivative( bof_xi, bof_xi, unit_weights, *state, der );
            for ( std::size_t k=0; k<m_nkp; k++ ) {
                dH[k]( i,i ) = der(k);
            }
            // ..and finally the last row/column (pertaining to the offset parameter b)..
            H( m_noofFreeSVs, i ) = H( i, m_noofFreeSVs ) = 1.0;
            for ( std::size_t k=0; k<m_nkp; k++ )
                dH[k]( m_noofFreeSVs, i ) = dH[k]( i, m_noofFreeSVs ) = 0.0;
        }

        // ..the lower-right-most entry gets set separately:
        H( m_noofFreeSVs, m_noofFreeSVs ) = 0.0;
        for ( std::size_t k=0; k<m_nkp; k++ ) {
            dH[k]( m_noofFreeSVs, m_noofFreeSVs ) = 0.0;
        }

        // initialize R and dR
        RealMatrix R( m_noofFreeSVs+1, m_noofBoundedSVs );
        std::vector< RealMatrix > dR( m_nkp, RealMatrix(m_noofFreeSVs+1, m_noofBoundedSVs) );
        for ( std::size_t i=0; i<m_noofBoundedSVs; i++ ) {
            get(bof_xi, 0) = m_basis.element(m_boundedAlphaIndices[i]); //fixed over the outer loop
            for ( std::size_t j=0; j<m_noofFreeSVs; j++ ) { //this time, we (have to) do it row by row
                get(bof_xj, 0) = m_basis.element(m_freeAlphaIndices[j]); //get the second sample into a batch
                mep_k->eval( bof_xi, bof_xj, bof_results, *state );
                R( j,i ) = bof_results(0,0);
                mep_k->weightedParameterDerivative( bof_xi, bof_xj, unit_weights, *state, der );
                for ( std::size_t k=0; k<m_nkp; k++ )
                    dR[k]( j,i ) = der(k);
            }
            R( m_noofFreeSVs, i ) = 1.0; //last row is for b
            for ( std::size_t k=0; k<m_nkp; k++ )
                dR[k]( m_noofFreeSVs, i ) = 0.0;
        }

        //O.K.: A big step in the computation of the derivative m_d_alphab_d_theta is the
        // multiplication with H^{-1}. However, instead of computing and storing
        // m_d_alphab_d_theta_i = -H^{-1}*b_i directly, we store each right-hand side b_i
        // and perform the multiplication with the inverse afterwards, by solving the
        // systems H*x_i = b_i for i = 1,...,m_nkp+1.
        // This is a lot faster and numerically more stable.

        // compute the derivative of (\alpha, b) w.r.t. C
        if ( m_noofBoundedSVs > 0 ) {
            noalias(column(m_d_alphab_d_theta,m_nkp)) = prod( R, m_boundedLabels );
        }
        // compute the derivative of (\alpha, b) w.r.t. the kernel parameters
        for ( std::size_t k=0; k<m_nkp; k++ ) {
            RealVector sum = prod( dH[k], m_freeAlphas ); //sum = dH * \alpha_f
            if ( m_noofBoundedSVs > 0 )
                noalias(sum) += prod( dR[k], m_boundedAlphas ); //sum += dR * \alpha_b, i.e., C*y_b is expressed as \alpha_b
            //fill the remaining columns of the derivative matrix (except the last, which is for C)
            noalias(column(m_d_alphab_d_theta,k)) = sum;
        }

        // lastly, solve the systems H*x_i = b_i
        // MAJOR STEP: this is the Achilles heel of the current implementation, cf. Keerthi et al. 2007
        // TODO: mtq: explore ways for speed-up..
        blas::generalSolveSystemInPlace<blas::SolveAXB>(H,m_d_alphab_d_theta);
        m_d_alphab_d_theta *= -1;

        // that's all, folks; we're done.
    }

};//class


}//namespace
#endif