SvmLogisticInterpretation.h
/*!
 *
 *
 * \brief Maximum-likelihood model selection for binary support vector machines.
 *
 *
 *
 * \author M. Tuma, T. Glasmachers
 * \date 2009-2012
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef SHARK_ML_SVMLOGISTICINTERPRETATION_H
#define SHARK_ML_SVMLOGISTICINTERPRETATION_H

#include <shark/Data/CVDatasetTools.h>
#include <shark/Models/Kernels/CSvmDerivative.h>
#include <shark/Algorithms/Trainers/CSvmTrainer.h>
#include <shark/Algorithms/Trainers/SigmoidFit.h>
#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <boost/math/special_functions/log1p.hpp>

namespace shark {

///
/// \brief Maximum-likelihood model selection score for binary support vector machines
///
/// \par
/// This class implements the maximum-likelihood based SVM model selection
/// procedure presented in the article "T. Glasmachers and C. Igel. Maximum
/// Likelihood Model Selection for 1-Norm Soft Margin SVMs with Multiple
/// Parameters. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2010."
/// At this point, only binary C-SVMs are supported.
/// \par
/// This class implements an AbstractObjectiveFunction. In detail, it provides
/// a differentiable measure of how well a C-SVM with given hyperparameters fulfills
/// the maximum-likelihood score presented in the paper. This error measure can then
/// be optimized externally via gradient-based optimizers. In other words, this
/// class provides a score, not an optimization method or a training algorithm. The
/// C-SVM hyperparameters have to be optimized with respect to this measure.
///
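/// \par
/// A minimal usage sketch (the setup is assumed, not part of this header: a binary
/// ClassificationDataset \c data, a Gaussian kernel, and Shark's IRpropPlus optimizer;
/// the fold and iteration counts are placeholders):
/// \code
/// CVFolds<ClassificationDataset> folds = createCVSameSize(data, 5); // 5-fold CV split
/// GaussianRbfKernel<> kernel(0.5);                  // one kernel parameter: gamma
/// SvmLogisticInterpretation<> mlms(folds, &kernel); // the score defined below
/// RealVector start(mlms.numberOfVariables());
/// start(0) = 0.5;                                   // initial gamma
/// start(1) = 0.0;                                   // initial log(C); C is unconstrained by default
/// IRpropPlus rprop;
/// rprop.init(mlms, start);
/// for (std::size_t t = 0; t != 50; ++t)
///     rprop.step(mlms);                             // rprop.solution().point holds the tuned parameters
/// \endcode
/// The parameter vector layout is the kernel's parameters followed by the regularization
/// parameter C, which is log-encoded if the unconstrained flag is set (the default).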
template<class InputType = RealVector>
class SvmLogisticInterpretation : public SingleObjectiveFunction {
public:
    typedef CVFolds< LabeledData<InputType, unsigned int> > FoldsType;
    typedef AbstractKernelFunction<InputType> KernelType;
protected:
    FoldsType m_folds;          ///< the underlying partitioned dataset.
    KernelType *mep_kernel;     ///< the kernel with which to run the SVM
    std::size_t m_nhp;          ///< for convenience, the Number of Hyper Parameters
    std::size_t m_nkp;          ///< for convenience, the Number of Kernel Parameters
    std::size_t m_numFolds;     ///< the number of folds to be used in cross-validation
    std::size_t m_numSamples;   ///< overall number of samples in the dataset
    std::size_t m_inputDims;    ///< input dimensionality
    bool m_svmCIsUnconstrained; ///< the SVM regularization parameter C is passed for unconstrained optimization, and the derivative should compensate for that
    QpStoppingCondition *mep_svmStoppingCondition; ///< the stopping criterion that is to be passed to the SVM trainer.
    bool m_sigmoidSlopeIsUnconstrained; ///< whether or not to use the unconstrained variant of the sigmoid. currently always true, not user-settable, existing for safety.

public:

    //! constructor.
    //! \param folds an already partitioned dataset (i.e., a CVFolds object)
    //! \param kernel pointer to the kernel to be used within the SVMs.
    //! \param unconstrained whether or not the C-parameter of the C-SVM is passed in unconstrained optimization mode.
    //! \param stop_cond the stopping conditions which are to be passed to the SVM trainer.
    SvmLogisticInterpretation(
        FoldsType const &folds, KernelType *kernel,
        bool unconstrained = true, QpStoppingCondition *stop_cond = NULL
    )
    : mep_kernel(kernel)
    , m_nhp(kernel->parameterVector().size()+1)
    , m_nkp(kernel->parameterVector().size())
    , m_numFolds(folds.size()) //gets number of folds!
    , m_numSamples(folds.dataset().numberOfElements())
    , m_inputDims(inputDimension(folds.dataset()))
    , m_svmCIsUnconstrained(unconstrained)
    , mep_svmStoppingCondition(stop_cond)
    , m_sigmoidSlopeIsUnconstrained(true)
    {
        SHARK_CHECK(kernel != NULL, "[SvmLogisticInterpretation::SvmLogisticInterpretation] kernel is not allowed to be NULL"); //mtq: necessary despite indirect check via call in initialization list?
        SHARK_CHECK(m_numFolds > 1, "[SvmLogisticInterpretation::SvmLogisticInterpretation] please provide a meaningful number of folds for cross validation");
        if (!m_svmCIsUnconstrained) //mtq: important: we additionally need to deal with kernel feasibility indicators! important!
            m_features |= IS_CONSTRAINED_FEATURE;
        m_features |= HAS_VALUE;
        if (mep_kernel->hasFirstParameterDerivative())
            m_features |= HAS_FIRST_DERIVATIVE;
        m_folds = folds;
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "SvmLogisticInterpretation"; }

    //! checks whether the search point provided is feasible
    //! \param input the point to test for feasibility
    bool isFeasible(const SearchPointType &input) const {
        SHARK_ASSERT(input.size() == m_nhp);
        //throw SHARKEXCEPTION("[SvmLogisticInterpretation::isFeasible] Please first clarify how the kernel parameter feasibility should be dealt with. Afterwards, please write a test for this method. Thanks.");
        if (input(m_nkp) <= 0.0 && !m_svmCIsUnconstrained) { //C is the last parameter; it must be positive when passed constrained
            return false;
        }
        return true;
    }

    std::size_t numberOfVariables() const {
        return m_nhp;
    }

    //! trains a number of SVMs in a cross-validation setting using the hyperparameters passed to this method.
    //! The output scores from all validation sets are then concatenated. Together with the true labels, these
    //! scores are used to fit a sigmoid that models the class membership probabilities given the SVM output
    //! scores as well as possible. This method returns the negative log-likelihood of the best-fitting
    //! sigmoid, given a set of SVM hyperparameters.
    //! \param parameters the SVM hyperparameters to use for all C-SVMs
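    //! In formulas: with sigmoid outputs \f$ p_i \f$ for the \f$ N \f$ stacked validation
    //! examples and their labels \f$ y_i \in \{0,1\} \f$, the returned score is the mean
    //! negative log-likelihood
    //! \f[ -\frac{1}{N} \sum_{i=1}^{N} \big( y_i \log p_i + (1 - y_i) \log(1 - p_i) \big) . \f]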
    double eval(SearchPointType const &parameters) const {
        SHARK_CHECK(m_nhp == parameters.size(), "[SvmLogisticInterpretation::eval] wrong number of parameters");
        // initialize, copy parameters
        double C_reg = (m_svmCIsUnconstrained ? std::exp(parameters(m_nkp)) : parameters(m_nkp)); //set up regularization parameter
        mep_kernel->setParameterVector(subrange(parameters, 0, m_nkp)); //set up kernel parameters
        // these two will be filled in order corresp. to all CV validation partitions stacked
        // behind one another, and then used to create datasets with
        std::vector< unsigned int > tmp_helper_labels(m_numSamples);
        std::vector< RealVector > tmp_helper_preds(m_numSamples);
        unsigned int next_label = 0; //helper index counter to monitor the next position to be filled in the above vectors

        // for each fold, train an SVM and get predictions on the validation data
        for (std::size_t i=0; i<m_numFolds; i++) {
            // get current train/validation partitions as well as corresponding labels
            ClassificationDataset cur_train_data = m_folds.training(i);
            ClassificationDataset cur_valid_data = m_folds.validation(i);
            std::size_t cur_vsize = cur_valid_data.numberOfElements();
            Data< unsigned int > cur_vlabels = cur_valid_data.labels(); //validation labels of this fold
            Data< RealVector > cur_vscores; //will hold SVM output scores for current validation partition
            // init SVM
            KernelClassifier<InputType> svm; //the SVM
            CSvmTrainer<InputType, double> csvm_trainer(mep_kernel, C_reg, true, m_svmCIsUnconstrained); //the trainer
            csvm_trainer.sparsify() = false;
            if (mep_svmStoppingCondition != NULL) {
                csvm_trainer.stoppingCondition() = *mep_svmStoppingCondition;
            } else {
                csvm_trainer.stoppingCondition().minAccuracy = 1e-3; //mtq: is this necessary? i think it could be set via long chain of default ctors..
                csvm_trainer.stoppingCondition().maxIterations = 200 * m_inputDims; //mtq: need good/better heuristics to determine a good value for this
            }

            // train SVM on current fold
            csvm_trainer.train(svm, cur_train_data);
            cur_vscores = svm.decisionFunction()(cur_valid_data.inputs()); //will result in a dataset of RealVector as output
            // copy the scores and corresponding labels to the dataset-wide storage
            for (std::size_t j=0; j<cur_vsize; j++) {
                tmp_helper_labels[next_label] = cur_vlabels.element(j);
                tmp_helper_preds[next_label] = cur_vscores.element(j);
                ++next_label;
            }
        }
        Data< unsigned int > all_validation_labels = createDataFromRange(tmp_helper_labels);
        Data< RealVector > all_validation_predictions = createDataFromRange(tmp_helper_preds);

        // now we got it all: the predictions across the validation folds, plus the correct corresponding
        // labels. so we go ahead and fit a sigmoid to be as good as possible a model between the two:
        SigmoidModel sigmoid_model(m_sigmoidSlopeIsUnconstrained); //use the unconstrained variant?
        SigmoidFitRpropNLL sigmoid_trainer(100); //number of Rprop iterations
        ClassificationDataset validation_dataset(all_validation_predictions, all_validation_labels);
        sigmoid_trainer.train(sigmoid_model, validation_dataset);
        // we're basically done. now only get the final cost value of the best fit, and return it:
        Data< RealVector > sigmoid_predictions = sigmoid_model(all_validation_predictions);

        double error = 0;
        for (std::size_t i=0; i<m_numSamples; i++) {
            double p = sigmoid_predictions.element(i)(0);
            if (all_validation_labels.element(i) == 1) { //positive class
                error -= std::log(p);
            } else { //negative class
                error -= boost::math::log1p(-p);
            }
        }

        return error/m_numSamples;
    }

    //! the derivative of the eval() function above w.r.t. the parameters.
    //! \param parameters the SVM hyperparameters to use for all C-SVMs
    //! \param derivative will store the computed derivative w.r.t. the current hyperparameters
    // mtq: should this also follow the first-call-eval()-then-call-deriv() paradigm?
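    //! \return the same mean negative log-likelihood as eval(), computed alongside the gradient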
    double evalDerivative(SearchPointType const &parameters, FirstOrderDerivative &derivative) const {
        SHARK_CHECK(m_nhp == parameters.size(), "[SvmLogisticInterpretation::evalDerivative] wrong number of parameters");
        // initialize, copy parameters
        double C_reg = (m_svmCIsUnconstrained ? std::exp(parameters(m_nkp)) : parameters(m_nkp)); //set up regularization parameter
        mep_kernel->setParameterVector(subrange(parameters, 0, m_nkp)); //set up kernel parameters
        // these two will be filled in order corresp. to all CV validation partitions stacked
        // behind one another, and then used to create datasets with
        std::vector< unsigned int > tmp_helper_labels(m_numSamples);
        std::vector< RealVector > tmp_helper_preds(m_numSamples);

        unsigned int next_label = 0; //helper index counter to monitor the next position to be filled in the above vectors
        // init variables especially for derivative
        RealMatrix all_validation_predict_derivs(m_numSamples, m_nhp); //will hold derivatives of all output scores w.r.t. all hyperparameters
        RealVector der; //temporary helper for derivative calls

        // for each fold, train an SVM and get predictions on the validation data
        for (std::size_t i=0; i<m_numFolds; i++) {
            // get current train/validation partitions as well as corresponding labels
            ClassificationDataset cur_train_data = m_folds.training(i);
            ClassificationDataset cur_valid_data = m_folds.validation(i);
            std::size_t cur_vsize = cur_valid_data.numberOfElements();
            Data< unsigned int > cur_vlabels = cur_valid_data.labels(); //validation labels of this fold
            Data< RealVector > cur_vinputs = cur_valid_data.inputs(); //validation inputs of this fold
            Data< RealVector > cur_vscores; //will hold SVM output scores for current validation partition
            // init SVM
            KernelClassifier<InputType> svm; //the SVM
            CSvmTrainer<InputType, double> csvm_trainer(mep_kernel, C_reg, true, m_svmCIsUnconstrained); //the trainer
            csvm_trainer.sparsify() = false;
            if (mep_svmStoppingCondition != NULL) {
                csvm_trainer.stoppingCondition() = *mep_svmStoppingCondition;
            } else {
                csvm_trainer.stoppingCondition().maxIterations = 200 * m_inputDims; //mtq: need good/better heuristics to determine a good value for this
            }
            // train SVM on current fold
            csvm_trainer.train(svm, cur_train_data);
            CSvmDerivative<InputType> svm_deriv(&svm, &csvm_trainer);
            cur_vscores = svm.decisionFunction()(cur_valid_data.inputs()); //will result in a dataset of RealVector as output
            // copy the scores and corresponding labels to the dataset-wide storage
            for (std::size_t j=0; j<cur_vsize; j++) {
                // copy label and prediction score
                tmp_helper_labels[next_label] = cur_vlabels.element(j);
                tmp_helper_preds[next_label] = cur_vscores.element(j);
                // get and store the derivative of the score w.r.t. the hyperparameters
                svm_deriv.modelCSvmParameterDerivative(cur_vinputs.element(j), der);
                noalias(row(all_validation_predict_derivs, next_label)) = der; //fast assignment of the derivative to the correct matrix row
                ++next_label;
            }
        }
        Data< unsigned int > all_validation_labels = createDataFromRange(tmp_helper_labels);
        Data< RealVector > all_validation_predictions = createDataFromRange(tmp_helper_preds);

        // now we got it all: the predictions across the validation folds, plus the correct corresponding
        // labels. so we go ahead and fit a sigmoid to be as good as possible a model between the two:
        SigmoidModel sigmoid_model(m_sigmoidSlopeIsUnconstrained); //use the unconstrained variant?
        SigmoidFitRpropNLL sigmoid_trainer(100); //number of Rprop iterations
        ClassificationDataset validation_dataset(all_validation_predictions, all_validation_labels);
        sigmoid_trainer.train(sigmoid_model, validation_dataset);
        // we're basically done. now only get the final cost value of the best fit, and return it:
        Data< RealVector > sigmoid_predictions = sigmoid_model(all_validation_predictions);

        // finally compute the derivative of the sigmoid model predictions:
        // (we're here a bit un-shark-ish in that we do some of the derivative calculations by hand where they
        // would also and more consistently be offered by their respective classes. one reason we're doing it
        // like this might be the missing batch processing for the evalDerivatives)
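        // per-sample chain rule: dL/dtheta = (dL/dp) * (dp/ds) * (ds/dtheta), where p is the
        // sigmoid output, s is the raw SVM score, and theta is the hyperparameter vector; the
        // three factors appear below as dL_dsp, dsp_dsvmp, and the rows of all_validation_predict_derivs.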
        derivative.resize(m_nhp);
        derivative.clear();

        double ss = (m_sigmoidSlopeIsUnconstrained ? std::exp(sigmoid_model.parameterVector()(0)) : sigmoid_model.parameterVector()(0)); //the fitted sigmoid slope
        double error = 0;
        for (std::size_t i=0; i<m_numSamples; i++) {
            double p = sigmoid_predictions.element(i)(0);
            // compute derivative of the negative log likelihood
            double dL_dsp; //derivative of the likelihood w.r.t. the sigmoid predictions
            if (all_validation_labels.element(i) == 1) { //positive class
                error -= std::log(p);
                dL_dsp = -1.0/p;
            } else { //negative class
                error -= boost::math::log1p(-p);
                dL_dsp = 1.0/(1.0-p);
            }
            // compute derivative of the sigmoid
            // derivative of sigmoid predictions w.r.t. SVM predictions
            double dsp_dsvmp = ss * p * (1.0-p); //severe sign confusion potential: p(1-p) is the derivative w.r.t. t in 1/(1+e**(-t))!
            for (std::size_t j=0; j<m_nhp; j++) {
                derivative(j) += dL_dsp * dsp_dsvmp * all_validation_predict_derivs(i,j);
            }
        }
        derivative /= m_numSamples;
        return error / m_numSamples;
    }
};


}
#endif