LassoRegression.h
//===========================================================================
/*!
 *
 *
 * \brief LASSO Regression
 *
 *
 *
 * \author T. Glasmachers
 * \date 2013
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_TRAINERS_LASSOREGRESSION_H
#define SHARK_ALGORITHMS_TRAINERS_LASSOREGRESSION_H

#include <shark/Models/LinearModel.h>
#include <shark/Algorithms/Trainers/AbstractTrainer.h>
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <cmath>


namespace shark {


/*!
 * \brief LASSO Regression
 *
 * LASSO Regression extracts a sparse vector of regression
 * coefficients. The original method amounts to L1-constrained
 * least squares regression, while this implementation uses an
 * L1 penalty instead of a constraint (which is equivalent).
 *
 * For data vectors \f$ x_i \f$ with real-valued labels \f$ y_i \f$
 * the trainer solves the problem
 * \f$ \min_w \quad \frac{1}{2} \sum_i (w^T x_i - y_i)^2 + \lambda \|w\|_1 \f$.
 * The target accuracy of the solution is measured in terms of the
 * largest absolute component (infinity norm) of the gradient of
 * the objective function.
 *
 * The trainer has one template parameter, namely the type of
 * the input vectors \f$ x_i \f$. These need to be vector valued,
 * typically either RealVector or CompressedRealVector. The
 * resulting weight vector w is represented by a LinearModel
 * object. Currently model outputs and labels are restricted to a
 * single dimension.
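 *
 * A minimal usage sketch; the data loading step is only indicated by a
 * hypothetical helper, and the model setup and accessors follow the usual
 * Shark conventions:
 * \code
 * RegressionDataset data = loadRegressionData();          // hypothetical helper returning LabeledData<RealVector, RealVector>
 * LinearModel<RealVector> model(inputDimension(data), 1); // single real-valued output
 * LassoRegression<RealVector> trainer(1.0);               // lambda = 1.0, default accuracy
 * trainer.train(model, data);
 * RealVector w = row(model.matrix(), 0);                  // sparse weight vector
 * \endcode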
 */
template <class InputVectorType = RealVector>
class LassoRegression : public AbstractTrainer<LinearModel<InputVectorType> >, public IParameterizable
{
public:
    // model and dataset types used by train()
    typedef LinearModel<InputVectorType> ModelType;
    typedef LabeledData<InputVectorType, RealVector> DataType;

    /// \brief Constructor.
    ///
    /// \param _lambda value of the regularization parameter (see class description)
    /// \param _accuracy stopping criterion for the iterative solver, maximal gradient component of the objective function (see class description)
    LassoRegression(double _lambda, double _accuracy = 0.01)
    : m_lambda(_lambda)
    , m_accuracy(_accuracy)
    {
        RANGE_CHECK(m_lambda >= 0.0);
        RANGE_CHECK(m_accuracy > 0.0);
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "LASSO regression"; }


    /// \brief Return the current setting of the regularization parameter.
    double lambda() const
    {
        return m_lambda;
    }

    /// \brief Set the regularization parameter.
    void setLambda(double lambda)
    {
        RANGE_CHECK(lambda >= 0.0);
        m_lambda = lambda;
    }

    /// \brief Return the current setting of the accuracy (maximal gradient component of the optimization problem).
    double accuracy() const
    {
        return m_accuracy;
    }

    /// \brief Set the accuracy (maximal gradient component of the optimization problem).
    void setAccuracy(double _accuracy)
    {
        RANGE_CHECK(_accuracy > 0.0);
        m_accuracy = _accuracy;
    }

    /// \brief Get the regularization parameter lambda through the IParameterizable interface.
    RealVector parameterVector() const
    {
        return RealVector(1, m_lambda);
    }

    /// \brief Set the regularization parameter lambda through the IParameterizable interface.
    void setParameterVector(const RealVector& param)
    {
        SIZE_CHECK(param.size() == 1);
        RANGE_CHECK(param(0) >= 0.0);
        m_lambda = param(0);
    }

    /// \brief Return the number of parameters (one in this case).
    size_t numberOfParameters() const
    {
        return 1;
    }

    /// \brief Train a linear model with LASSO regression.
    void train(ModelType& model, DataType const& dataset)
    {
        SIZE_CHECK(model.outputSize() == 1);

        dim = inputDimension(dataset);
        RealVector alpha(dim, 0.0);
        trainInternal(alpha, dataset);

        RealMatrix mat(1, dim);
        row(mat, 0) = alpha;
        model.setStructure(mat);
    }

protected:

    /// \brief Actual training procedure.
    void trainInternal(RealVector& alpha, DataType const& dataset)
    {
        // strategy constants
        const double CHANGE_RATE = 0.2;
        const double PREF_MIN = 0.05;
        const double PREF_MAX = 20.0;

        // console output
        const bool verbose = false;

        // copy the dataset into a single matrix, transposed so that each row holds one feature
        data = trans(createBatch(dataset.inputs().elements()));
        label = column(createBatch(dataset.labels().elements()),0);

        RealVector diag(dim);
        RealVector w = -label;   // residual of the current model, which is -y for alpha = 0
        UIntVector index(dim);

        // pre-calculate diagonal matrix entries (feature-wise squared norms)
        for (size_t i=0; i<dim; i++){
            diag[i] = norm_sqr(row(data,i));
        }

        // prepare preferences for scheduling
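        // Each coordinate i carries a preference pref[i] controlling how many
        // update slots it receives per sweep; the preferences are adapted below
        // from the observed gain of each coordinate step.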
        RealVector pref(dim,1.0);
        double prefsum = (double)dim;


        // prepare performance monitoring for self-adaptation
        const double gain_learning_rate = 1.0 / dim;
        double average_gain = 0.0;
        int canstop = 1;
        const double lambda = m_lambda;

        // main optimization loop
        std::size_t iter = 0;
        std::size_t steps = 0;
        while (true)
        {
            double maxvio = 0.0;

            // define schedule
            double psum = prefsum;
            prefsum = 0.0;
            int pos = 0;

            for (std::size_t i=0; i<dim; i++)
            {
                double p = pref[i];
                double n;
                if (psum >= 1e-6 && p < psum)
                    n = (dim - pos) * p / psum;
                else
                    n = (dim - pos); // for numerical stability

                unsigned int m = (unsigned int)floor(n);
                double prob = n - m;
                if ((double)rand() / (double)RAND_MAX < prob) m++;
                for (std::size_t j=0; j<m; j++)
                {
                    index[pos] = (unsigned int)i;
                    pos++;
                }
                psum -= p;
                prefsum += p;
            }
            for (std::size_t i=0; i<dim; i++)
            {
                std::size_t r = rand() % dim;
                std::swap(index[r], index[i]);
            }

            steps += dim;
            for (size_t s=0; s<dim; s++)
            {
                std::size_t i = index[s];
                double a = alpha[i];
                double d = diag[i];

                // compute "gradient component" <w, X_i>
                double grad = inner_prod(w,row(data,i));

                // compute optimal coordinate descent step and corresponding gain
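                // Restricted to coordinate i, the objective is a one-dimensional
                // quadratic with curvature d plus the kink lambda*|alpha_i + delta|;
                // the three cases below (a == 0, a > 0, a < 0) compute the Newton
                // step on the corresponding smooth piece, clipped at zero.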
                double vio = 0.0;
                double gain = 0.0;
                double delta = 0.0;
                if (a == 0.0)
                {
                    if (grad > lambda)
                    {
                        vio = grad - lambda;
                        delta = -vio / d;
                        gain = 0.5 * d * delta * delta;
                    }
                    else if (grad < -lambda)
                    {
                        vio = -grad - lambda;
                        delta = vio / d;
                        gain = 0.5 * d * delta * delta;
                    }
                }
                else if (a > 0.0)
                {
                    grad += lambda;
                    vio = std::fabs(grad);
                    delta = -grad / d;
                    if (delta < -a)
                    {
                        delta = -a;
                        gain = delta * (grad - 0.5 * d * delta);
                        double g0 = grad - a * d - 2.0 * lambda;
                        if (g0 > 0.0)
                        {
                            double dd = -g0 / d;
                            gain = dd * (grad - 0.5 * d * dd);
                            delta += dd;
                        }
                    }
                    else gain = 0.5 * d * delta * delta;
                }
                else
                {
                    grad -= lambda;
                    vio = std::fabs(grad);
                    delta = -grad / d;
                    if (delta > -a)
                    {
                        delta = -a;
                        gain = delta * (grad - 0.5 * d * delta);
                        double g0 = grad - a * d + 2.0 * lambda;
                        if (g0 < 0.0)
                        {
                            double dd = -g0 / d;
                            gain = dd * (grad - 0.5 * d * dd);
                            delta += dd;
                        }
                    }
                    else gain = 0.5 * d * delta * delta;
                }

                // update state
                if (vio > maxvio)
                    maxvio = vio;
                if (delta != 0.0)
                {
                    alpha[i] += delta;
                    noalias(w) += delta*row(data,i);
                }

                // update gain-based preferences
                {
                    if (iter == 0)
                        average_gain += gain / (double)dim;
                    else
                    {
                        double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        double newpref = pref[i] * std::exp(change);
                        newpref = std::min(std::max(newpref,PREF_MIN),PREF_MAX);
                        prefsum += newpref - pref[i];
                        pref[i] = newpref;
                        average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    }
                }
            }
            iter++;

            if (maxvio <= m_accuracy)
            {
                if (canstop)
                    break;
                else
                {
                    // prepare full sweep for a reliable check of the stopping criterion
                    canstop = 1;
                    noalias(pref) = blas::repeat(1.0, dim);   // uniform preferences, consistent with prefsum below
                    prefsum = (double)dim;
                    if (verbose) std::cout << "*" << std::flush;
                }
            }
            else
            {
                canstop = 0;
                if (verbose) std::cout << "." << std::flush;
            }
        }
    }

    double m_lambda;                              ///< regularization parameter
    double m_accuracy;                            ///< gradient accuracy
    std::size_t dim;                              ///< dimension; number of features
    std::size_t ell;                              ///< number of points
    RealVector label;                             ///< dense label vector, one entry per point
    typename Batch<InputVectorType>::type data;   ///< matrix of sparse vectors, one row per feature
};


}
#endif