// ArdKernel.h — Shark ARD kernel header (recovered from a Doxygen source view).
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Gaussian automatic relevance detection (ARD) kernel
6  *
7  *
8  *
9  * \author T.Glasmachers, O. Krause, M. Tuma
10  * \date 2010-2012
11  *
12  *
13  * \par Copyright 1995-2015 Shark Development Team
14  *
15  * <BR><HR>
16  * This file is part of Shark.
17  * <http://image.diku.dk/shark/>
18  *
19  * Shark is free software: you can redistribute it and/or modify
20  * it under the terms of the GNU Lesser General Public License as published
21  * by the Free Software Foundation, either version 3 of the License, or
22  * (at your option) any later version.
23  *
24  * Shark is distributed in the hope that it will be useful,
25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27  * GNU Lesser General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License
30  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31  *
32  */
33 //===========================================================================
34 
35 #ifndef SHARK_MODELS_KERNELS_GAUSSIAN_ARD_KERNEL_H
36 #define SHARK_MODELS_KERNELS_GAUSSIAN_ARD_KERNEL_H
#include <shark/Models/Kernels/AbstractKernelFunction.h>
#include <cmath>

40 namespace shark {
41 
42 
43 /// \brief Automatic relevance detection kernel for unconstrained parameter optimization
44 ///
45 /// The automatic relevance detection (ARD) kernel is a general Gaussian kernel with
46 /// diagonal covariance matrix:
47 /// \f$ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f$.
48 /// The ARD kernel holds one real-valued parameter \f$ \gamma_i \f$ per input dimension.
49 /// The parameters \f$ p_i \f$ are encoded as \f$ p_i^2 = \gamma_i \f$, allowing for unconstrained
50 /// optimization. Here, the exposed/visible parameters are squared before being used in the
51 /// actual computations, because the otherwise Shark-canonical exponential encoding proved
52 /// a bit unstable, that is, leading to too big step sizes, in preliminary experiments.
53 ///
54 /// Note that, like all or most models/kernels designed for unconstrained optimization, the
55 /// argument to the constructor corresponds to the value of the true weights, while the set
56 /// and get methods for the parameter vector set the parameterized values and not the true weights.
57 ///
58 /// \todo slow default implementation. Use BLAS3 calls to make things faster
59 template<class InputType=RealVector>
61 {
62 private:
64 
65  struct InternalState: public State{
66  RealMatrix kxy;
67 
68  void resize(std::size_t sizeX1,std::size_t sizeX2){
69  kxy.resize(sizeX1,sizeX2);
70  }
71  };
72 public:
76 
77  /// Constructor
78  /// \param dim input dimension
79  /// \param gamma_init initial gamma value for all dimensions (true value, used as passed into ctor)
80  ARDKernelUnconstrained(unsigned int dim, double gamma_init = 1.0){
81  SHARK_CHECK( gamma_init > 0, "[ARDKernelUnconstrained::ARDKernelUnconstrained] Expected positive weight.");
82 
83  //init abstract model's informational flags
87 
88  //initialize self
89  m_inputDimensions = dim;
92  double sqrt_gamma = std::sqrt( gamma_init );
93  for ( unsigned int i=0; i<m_inputDimensions; i++ ){
94  m_gammas(i) = gamma_init;
95  m_params(i) = sqrt_gamma;
96  }
97  }
98 
99  /// \brief From INameable: return the class name.
100  std::string name() const
101  { return "ARDKernelUnconstrained"; }
102 
103  RealVector parameterVector() const{
104  return m_params;
105  }
106  void setParameterVector(RealVector const& newParameters){
107  SIZE_CHECK(newParameters.size() == m_inputDimensions);
108  m_params = newParameters;
109  noalias(m_gammas) = sqr(m_params);
110  }
111  std::size_t numberOfParameters() const{
112  return m_inputDimensions;
113  }
114 
115  ///\brief creates the internal state of the kernel
116  boost::shared_ptr<State> createState()const{
117  return boost::shared_ptr<State>(new InternalState());
118  }
119 
120  /// convenience methods for setting/getting the actual gamma values
121  RealVector gammaVector() const{
122  return m_gammas;
123  }
124  void setGammaVector( RealVector const& newGammas ) {
125 #ifndef DNDEBUG
126  SIZE_CHECK( newGammas.size() == m_inputDimensions );
127  for ( unsigned int i=0; i<m_inputDimensions; i++ ) {
128  RANGE_CHECK( newGammas(i) > 0 );
129  }
130 #endif
131  m_gammas = newGammas;
132  noalias(m_params) = sqrt(m_gammas);
133  }
134 
135  /// \brief evaluates \f$ k(x,z)\f$
136  ///
137  /// ARD kernel evaluation
138  /// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
139  double eval(ConstInputReference x1, ConstInputReference x2) const{
140  SIZE_CHECK(x1.size() == x2.size());
141  SIZE_CHECK(x1.size() == m_inputDimensions);
142  double dmnorm2 = diagonalMahalanobisDistanceSqr(x1, x2, m_gammas);
143  return std::exp(-dmnorm2);
144  }
145 
146  /// \brief evaluates \f$ k(x,z)\f$ for a whole batch
147  ///
148  /// ARD kernel evaluation
149  /// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
150  void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result) const{
151  SIZE_CHECK(batchX1.size2() == batchX2.size2());
152  SIZE_CHECK(batchX1.size2() == m_inputDimensions);
153 
154  std::size_t sizeX1 = batchX1.size1();
155  std::size_t sizeX2 = batchX2.size1();
156 
157  ensure_size(result,sizeX1,sizeX2);
158  //todo: implement fast version of diagonalMahalanobisDistanceSqr for matrices
159  for(std::size_t i = 0; i != sizeX1; ++i){
160  for(std::size_t j = 0; j != sizeX2; ++j){
161  double dmnorm2 = diagonalMahalanobisDistanceSqr(row(batchX1,i), row(batchX2,j), m_gammas);
162  result(i,j)=std::exp(-dmnorm2);
163  }
164  }
165  }
166 
167  /// \brief evaluates \f$ k(x,z)\f$ for a whole batch
168  ///
169  /// ARD kernel evaluation
170  /// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
171  void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result, State& state) const{
172  SIZE_CHECK(batchX1.size2() == batchX2.size2());
173  SIZE_CHECK(batchX1.size2() == m_inputDimensions);
174 
175  std::size_t sizeX1 = batchX1.size1();
176  std::size_t sizeX2 = batchX2.size1();
177 
178  InternalState& s = state.toState<InternalState>();
179  s.resize(sizeX1,sizeX2);
180 
181  ensure_size(result,sizeX1,sizeX2);
182  //todo: implement fast version of diagonalMahalanobisDistanceSqr for matrices
183  for(std::size_t i = 0; i != sizeX1; ++i){
184  for(std::size_t j = 0; j != sizeX2; ++j){
185  double dmnorm2 = diagonalMahalanobisDistanceSqr(row(batchX1,i), row(batchX2,j), m_gammas);
186  result(i,j) = std::exp(-dmnorm2);
187  s.kxy(i,j) = result(i,j);
188  }
189  }
190  }
191 
192  /// \brief evaluates \f$ \frac {\partial k(x,z)}{\partial \sqrt{\gamma_i}}\f$ weighted over a whole batch
193  ///
194  /// Since the ARD kernel is parametrized for unconstrained optimization, we return
195  /// the derivative w.r.t. the parameters \f$ p_i \f$, where \f$ p_i^2 = \gamma_i \f$.
196  ///
197  /// \f[ \frac {\partial k(x,z)}{\partial p_i} = -2 p_i (x_i - z_i)^2 \cdot k(x,z) \f]
199  ConstBatchInputReference batchX1,
200  ConstBatchInputReference batchX2,
201  RealMatrix const& coefficients,
202  State const& state,
203  RealVector& gradient
204  ) const{
205  SIZE_CHECK(batchX1.size2() == batchX2.size2());
206  SIZE_CHECK(batchX1.size2() == m_inputDimensions);
207 
208  std::size_t sizeX1 = batchX1.size1();
209  std::size_t sizeX2 = batchX2.size1();
210 
211  ensure_size(gradient, m_inputDimensions );
212  gradient.clear();
213  InternalState const& s = state.toState<InternalState>();
214 
215  for(std::size_t i = 0; i != sizeX1; ++i){
216  for(std::size_t j = 0; j != sizeX2; ++j){
217  double coeff = coefficients(i,j) * s.kxy(i,j);
218  gradient += coeff * m_params * sqr(row(batchX1,i)-row(batchX2,j));
219  }
220  }
221  gradient *= -2;
222  }
223 
224  /// \brief evaluates \f$ \frac {\partial k(x,z)}{\partial x}\f$
225  ///
226  /// first derivative of ARD kernel wrt the first input pattern
227  /// \f[ \frac {\partial k(x,z)}{\partial x} = -2 \gamma_i \left( x_i - z_i \right)\cdot k(x,z) \f]
229  ConstBatchInputReference batchX1,
230  ConstBatchInputReference batchX2,
231  RealMatrix const& coefficientsX2,
232  State const& state,
233  BatchInputType& gradient
234  ) const{
235  SIZE_CHECK(batchX1.size2() == batchX2.size2());
236  SIZE_CHECK(batchX1.size2() == m_inputDimensions);
237 
238  std::size_t sizeX1 = batchX1.size1();
239  std::size_t sizeX2 = batchX2.size1();
240 
241  InternalState const& s = state.toState<InternalState>();
242  ensure_size(gradient, sizeX1, m_inputDimensions );
243  gradient.clear();
244 
245  for(std::size_t i = 0; i != sizeX1; ++i){
246  for(std::size_t j = 0; j != sizeX2; ++j){
247  double coeff = coefficientsX2(i,j) * s.kxy(i,j);
248  row(gradient,i) += coeff * m_gammas * (row(batchX1,i)-row(batchX2,j));
249  }
250  }
251  gradient *= -2.0;
252  }
253 
254  void read(InArchive& ar){
255  ar >> m_gammas;
256  ar >> m_inputDimensions;
257  m_params.resize(m_inputDimensions);
258  noalias(m_params) = sqrt(m_gammas);
259  }
260 
261  void write(OutArchive& ar) const{
262  ar << m_gammas;
263  ar << m_inputDimensions;
264  }
265 
266 protected:
267  RealVector m_gammas; ///< kernel bandwidth parameters, one for each input dimension. squares of m_params.
268  RealVector m_params; ///< parameters as seen by the external optimizer (for unconstrained optimization). can be negative.
269  std::size_t m_inputDimensions; ///< how many input dimensions = how many bandwidth parameters
270 };
271 
275 
276 }
277 #endif