KernelTargetAlignment.h
/*!
 *
 *
 * \brief Kernel Target Alignment - a measure of alignment of a kernel Gram matrix with labels.
 *
 *
 *
 * \author T. Glasmachers, O. Krause
 * \date 2010-2012
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef SHARK_OBJECTIVEFUNCTIONS_KERNELTARGETALIGNMENT_H
#define SHARK_OBJECTIVEFUNCTIONS_KERNELTARGETALIGNMENT_H

#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/Data/Dataset.h>
#include <shark/Data/Statistics.h>
#include <shark/Models/Kernels/AbstractKernelFunction.h>


namespace shark{

/*!
 * \brief Kernel Target Alignment - a measure of alignment of a kernel Gram matrix with labels.
 *
 * \par
 * The Kernel Target Alignment (KTA) was originally proposed in the paper:<br/>
 * <i>On Kernel-Target Alignment</i>. N. Cristianini, J. Shawe-Taylor,
 * A. Elisseeff, J. Kandola. Innovations in Machine Learning, 2006.<br/>
 * Here we provide a version with centering of the features as proposed
 * in the paper:<br/>
 * <i>Two-Stage Learning Kernel Algorithms</i>. C. Cortes, M. Mohri,
 * A. Rostamizadeh. ICML 2010.<br/>
 *
 * \par
 * The kernel target alignment is defined as
 * \f[ \hat A = \frac{\langle K, y y^T \rangle}{\sqrt{\langle K, K \rangle \cdot \langle y y^T, y y^T \rangle}} \f]
 * where K is the kernel Gram matrix of the data and y is the vector of
 * +1/-1 valued labels. The outer product \f$ y y^T \f$ corresponds to
 * an "ideal" Gram matrix belonging to a kernel that maps
 * the two classes each to a single point, thus minimizing within-class
 * distance for fixed inter-class distance. The inner products denote the
 * Frobenius product of matrices:
 * http://en.wikipedia.org/wiki/Matrix_multiplication#Frobenius_product
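 * By the Cauchy-Schwarz inequality \f$ \hat A \le 1 \f$ holds for any
 * nonzero K, with equality exactly if K is a positive multiple of
 * \f$ y y^T \f$, i.e., if the kernel itself already realizes the ideal
 * Gram matrix.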
 *
 * \par
 * In kernel-based learning, the kernel Gram matrix K is of the form
 * \f[ K_{i,j} = k(x_i, x_j) = \langle \phi(x_i), \phi(x_j) \rangle \f]
 * for a Mercer kernel function k and inputs \f$ x_i, x_j \f$. In this
 * version of the KTA we use centered feature vectors. Let
 * \f[ \psi(x_i) = \phi(x_i) - \frac{1}{\ell} \sum_{j=1}^{\ell} \phi(x_j) \f]
 * denote the centered feature vectors, then the centered Gram matrix
 * \f$ K^c \f$ is given by
 * \f[ K^c_{i,j} = \langle \psi(x_i), \psi(x_j) \rangle = K_{i,j} - \frac{1}{\ell} \sum_{n=1}^{\ell} \big( K_{i,n} + K_{j,n} \big) + \frac{1}{\ell^2} \sum_{m,n=1}^{\ell} K_{m,n} \f]
 * The alignment measure computed by this class is the exact same formula
 * for \f$ \hat A \f$, but with \f$ K^c \f$ plugged in in place of \f$ K \f$.
 *
 * \par
 * KTA measures the Frobenius inner product between a kernel Gram matrix
 * and this ideal matrix. The interpretation is that KTA measures how
 * well a given kernel fits a classification problem. The actual measure
 * is invariant under kernel rescaling.
 * In Shark, objective functions are minimized by convention. Therefore
 * the negative alignment \f$ - \hat A \f$ is implemented. The measure is
 * extended for multi-class problems by using prototype vectors instead
 * of scalar labels.
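 * Concretely, in this implementation two points contribute an ideal
 * similarity of \f$ Y_{ij} = 1 \f$ if they share the same label and of
 * \f$ Y_{ij} = -1/(d-1) \f$ otherwise, where d is the number of classes;
 * for vector-valued labels the inner product of the label vectors is
 * used instead.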
 *
 * \par
 * The following properties of KTA are important from a model selection
 * point of view: it is relatively fast and easy to compute, it is
 * differentiable w.r.t. the kernel function, and it is independent of
 * the actual classifier.
 *
 * \par
 * The following notation is used in several of the methods of the class.
 * \f$ K^c \f$ denotes the centered Gram matrix, y is the vector of labels,
 * Y is the outer product of this vector with itself, k is the row-wise
 * (or, equivalently, column-wise) average of the uncentered Gram matrix K,
 * mk is the overall mean of K, my is the label average, u is the vector of
 * all ones, and \f$ \ell \f$ is the number of data points and thus the size
 * of the Gram matrix.
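 *
 * \par
 * A minimal usage sketch is given below. It assumes a Gaussian RBF kernel
 * and a classification data set; the kernel, optimizer and number of
 * iterations are illustrative choices, not part of this class:
 * \code
 * ClassificationDataset data = ...;            // inputs with unsigned int labels
 * GaussianRbfKernel<> kernel(0.5);             // initial kernel parameters
 * KernelTargetAlignment<RealVector> kta(data, &kernel);
 *
 * IRpropPlus optimizer;
 * optimizer.init(kta);
 * for(std::size_t i = 0; i != 100; ++i)
 *     optimizer.step(kta);                     // maximizes the alignment (minimizes -A)
 * kernel.setParameterVector(optimizer.solution().point);
 * \endcode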
 */
template<class InputType = RealVector,class LabelType = unsigned int>
class KernelTargetAlignment : public SingleObjectiveFunction
{
private:
    typedef typename Batch<LabelType>::type BatchLabelType;
public:
    /// \brief Construction of the Kernel Target Alignment (KTA) from a kernel object.
    ///
    /// The data set on which the alignment is measured and the kernel have
    /// to be provided to the constructor; the kernel must not be NULL.
    KernelTargetAlignment(
        LabeledData<InputType, LabelType> const& dataset,
        AbstractKernelFunction<InputType>* kernel
    ){
        SHARK_CHECK(kernel != NULL, "[KernelTargetAlignment] kernel must not be NULL");

        mep_kernel = kernel;

        m_features |= HAS_VALUE;
        m_features |= CAN_PROPOSE_STARTING_POINT;

        if(mep_kernel -> hasFirstParameterDerivative())
            m_features |= HAS_FIRST_DERIVATIVE;

        m_data = dataset;
        m_elements = dataset.numberOfElements();


        setupY(dataset.labels());
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "KernelTargetAlignment"; }

    /// Return the current kernel parameters as a starting point for an optimization run.
    SearchPointType proposeStartingPoint() const {
        return mep_kernel -> parameterVector();
    }


    std::size_t numberOfVariables()const{
        return mep_kernel->numberOfParameters();
    }

    /// \brief Evaluate the (centered, negative) Kernel Target Alignment (KTA).
    ///
    /// See the class description for more details on this computation.
    double eval(RealVector const& input) const{
        mep_kernel->setParameterVector(input);

        return -evaluateKernelMatrix().error;
    }

    /// \brief Compute the derivative of the KTA as a function of the kernel parameters.
    ///
    /// It holds:
    /// \f[ \langle K^c, K^c \rangle = \langle K,K \rangle - 2 \ell \langle k,k \rangle + mk^2 \ell^2 \\
    ///     (\langle K^c, K^c \rangle)' = 2 \langle K,K' \rangle - 4 \ell \langle k, \frac{1}{\ell} \sum_j K'_{ij} \rangle + 2 \ell^2 mk \sum_{ij} \frac{1}{\ell^2} K'_{ij} \\
    ///     = 2 \langle K,K' \rangle - 4 \langle k, \sum_j K'_{ij} \rangle + 2 mk \sum_{ij} K'_{ij} \\
    ///     = 2 \langle K,K' \rangle - 4 \langle k u^T, K' \rangle + 2 mk \langle u u^T, K' \rangle \\
    ///     = 2 \langle K - 2 k u^T + mk\, u u^T, K' \rangle \\
    ///     \langle Y, K^c \rangle = \langle Y, K \rangle - 2 \ell \langle y, k \rangle + \ell^2 my\, mk \\
    ///     (\langle Y, K^c \rangle)' = \langle Y - 2 y u^T + my\, u u^T, K' \rangle \f]
    /// Now the derivative is computed from these values in a second sweep over the data;
    /// we get:
    /// \f[ \hat A' = \frac{1}{\langle K^c,K^c \rangle^{3/2}} \left( \langle K^c,K^c \rangle (\langle Y,K^c \rangle)' - \frac{1}{2} \langle Y, K^c \rangle (\langle K^c, K^c \rangle)' \right) \\
    ///     = \frac{1}{\langle K^c,K^c \rangle^{3/2}} \langle \langle K^c,K^c \rangle (Y - 2 y u^T + my\, u u^T) - \langle Y, K^c \rangle (K - 2 k u^T + mk\, u u^T), K' \rangle \\
    ///     = \frac{1}{\langle K^c,K^c \rangle^{3/2}} \langle W, K' \rangle \f]
    /// Reordering results in
    /// \f[ W = \langle K^c,K^c \rangle Y - \langle Y, K^c \rangle K \\
    ///     - 2 (\langle K^c,K^c \rangle y - \langle Y, K^c \rangle k) u^T \\
    ///     + (\langle K^c,K^c \rangle my - \langle Y, K^c \rangle mk) u u^T \f]
    /// where \f$ K' \f$ is the derivative of K with respect to the kernel parameters.
    ResultType evalDerivative( const SearchPointType & input, FirstOrderDerivative & derivative ) const {
        mep_kernel->setParameterVector(input);
        // the derivative is calculated in two sweeps of the data. first the required statistics
        // \langle K^c,K^c \rangle, mk and k are evaluated exactly as in eval

        KernelMatrixResults results = evaluateKernelMatrix();

        std::size_t parameters = mep_kernel->numberOfParameters();
        derivative.resize(parameters);
        derivative.clear();
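        // second sweep: for every block (i,j) with j <= i of the Gram matrix the
        // corresponding block of the weight matrix W is built and passed to the
        // kernel's weightedParameterDerivative; off-diagonal blocks enter twice
        // (symmetry of K), which generateDerivativeWeightBlock accounts for.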
        SHARK_PARALLEL_FOR(int i = 0; i < (int)m_data.numberOfBatches(); ++i){
            std::size_t startX = 0;
            for(int j = 0; j != i; ++j){
                startX += size(m_data.batch(j));
            }
            RealVector threadDerivative(parameters,0.0);
            RealVector blockDerivative;
            boost::shared_ptr<State> state = mep_kernel->createState();
            RealMatrix blockK;//block of the KernelMatrix
            RealMatrix blockW;//block of the WeightMatrix
            std::size_t startY = 0;
            for(int j = 0; j <= i; ++j){
                mep_kernel->eval(m_data.batch(i).input,m_data.batch(j).input,blockK,*state);
                mep_kernel->weightedParameterDerivative(
                    m_data.batch(i).input,m_data.batch(j).input,
                    generateDerivativeWeightBlock(i,j,startX,startY,blockK,results),//takes symmetry into account
                    *state,
                    blockDerivative
                );
                noalias(threadDerivative) += blockDerivative;
                startY += size(m_data.batch(j));
            }
            SHARK_CRITICAL_REGION{
                noalias(derivative) += threadDerivative;
            }
        }
        derivative *= -1;
        return -results.error;
    }

private:
    AbstractKernelFunction<InputType>* mep_kernel;  ///< kernel function
    LabeledData<InputType,LabelType> m_data;        ///< data points
    RealVector m_columnMeanY;                       ///< column means of the ideal label matrix Y
    double m_meanY;                                 ///< overall mean of the ideal label matrix Y
    std::size_t m_numberOfClasses;                  ///< number of classes
    std::size_t m_elements;                         ///< number of data points

    struct KernelMatrixResults{
        RealVector k;
        double KcKc;
        double YcKc;
        double error;
        double meanK;
    };

    void setupY(Data<unsigned int>const& labels){
        //preprocess Y: calculate the column means and the overall mean of Y
        //the most efficient way to do this is via the class counts
        std::vector<std::size_t> classCount = classSizes(labels);
        m_numberOfClasses = classCount.size();
        RealVector classMean(m_numberOfClasses);
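        // classMean(c) is the mean of any column of Y belonging to an element of
        // class c: such a column contains classCount[c] entries equal to 1 and
        // (m_elements - classCount[c]) entries equal to -1/(d-1), so its sum is
        // classCount[c] - (m_elements - classCount[c])/(d-1).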
        double dm1 = m_numberOfClasses-1.0;
        for(std::size_t i = 0; i != m_numberOfClasses; ++i){
            classMean(i) = classCount[i]-(m_elements-classCount[i])/dm1;
        }
        classMean /= m_elements;

        m_columnMeanY.resize(m_elements);
        for(std::size_t i = 0; i != m_elements; ++i){
            m_columnMeanY(i) = classMean(labels.element(i));
        }
        m_meanY = sum(m_columnMeanY)/m_elements;
    }

    void setupY(Data<RealVector>const& labels){
        RealVector meanLabel = mean(labels);
        m_columnMeanY.resize(m_elements);
        for(std::size_t i = 0; i != m_elements; ++i){
            m_columnMeanY(i) = inner_prod(labels.element(i),meanLabel);
        }
        m_meanY = sum(m_columnMeanY)/m_elements;
    }

    /// Compute the contribution of a sub-block of the Gram matrix to \f$ \langle Y, K \rangle \f$.
    double updateYKc(UIntVector const& labelsi,UIntVector const& labelsj, RealMatrix const& block)const{
        std::size_t blockSize1 = labelsi.size();
        std::size_t blockSize2 = labelsj.size();
        //todo optimize the i=j case
        double result = 0;
        double dm1 = m_numberOfClasses-1.0;
        for(std::size_t k = 0; k != blockSize1; ++k){
            for(std::size_t l = 0; l != blockSize2; ++l){
                if(labelsi(k) == labelsj(l))
                    result += block(k,l);
                else
                    result -= block(k,l)/dm1;
            }
        }
        return result;
    }

    /// Compute the contribution of a sub-block of the Gram matrix to \f$ \langle Y, K \rangle \f$.
    double updateYKc(RealMatrix const& labelsi,RealMatrix const& labelsj, RealMatrix const& block)const{
        std::size_t blockSize1 = labelsi.size1();
        std::size_t blockSize2 = labelsj.size1();
        //todo optimize the i=j case
        double result = 0;
        for(std::size_t k = 0; k != blockSize1; ++k){
            for(std::size_t l = 0; l != blockSize2; ++l){
                double y_kl = inner_prod(row(labelsi,k),row(labelsj,l));
                result += y_kl*block(k,l);
            }
        }
        return result;
    }

    /// Compute a sub-block of the ideal Gram matrix Y for class labels:
    /// the entry is 1 if the two labels agree and -1/(d-1) otherwise,
    /// where d is the number of classes.
    void computeBlockY(UIntVector const& labelsi,UIntVector const& labelsj, RealMatrix& blockY)const{
        std::size_t blockSize1 = labelsi.size();
        std::size_t blockSize2 = labelsj.size();
        double dm1 = m_numberOfClasses-1.0;
        for(std::size_t k = 0; k != blockSize1; ++k){
            for(std::size_t l = 0; l != blockSize2; ++l){
                if( labelsi(k) == labelsj(l))
                    blockY(k,l) = 1;
                else
                    blockY(k,l) = -1.0/dm1;
            }
        }
    }

    /// Compute a sub-block of the ideal Gram matrix Y for vector-valued labels:
    /// the entry is the inner product of the two label vectors.
    void computeBlockY(RealMatrix const& labelsi,RealMatrix const& labelsj, RealMatrix& blockY)const{
        std::size_t blockSize1 = labelsi.size1();
        std::size_t blockSize2 = labelsj.size1();
        for(std::size_t k = 0; k != blockSize1; ++k){
            for(std::size_t l = 0; l != blockSize2; ++l){
                blockY(k,l) = inner_prod(row(labelsi,k),row(labelsj,l));
            }
        }
    }

    /// Compute a sub-block of the matrix
    /// \f[ W = \langle K^c, K^c \rangle Y - \langle Y, K^c \rangle K - 2 (\langle K^c, K^c \rangle y - \langle Y, K^c \rangle k) u^T + (\langle K^c, K^c \rangle my - \langle Y, K^c \rangle mk) u u^T \f]
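    /// The returned block is additionally divided by \f$ \langle K^c,K^c \rangle^{3/2} \f$,
    /// the normalization factor of the derivative, and doubled for off-diagonal
    /// blocks, since only the lower triangle of blocks is visited by the caller.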
    RealMatrix generateDerivativeWeightBlock(
        std::size_t i, std::size_t j,
        std::size_t start1, std::size_t start2,
        RealMatrix const& blockK,
        KernelMatrixResults const& matrixStatistics
    )const{
        std::size_t blockSize1 = size(m_data.batch(i));
        std::size_t blockSize2 = size(m_data.batch(j));
        //double n = m_elements;
        double KcKc = matrixStatistics.KcKc;
        double YcKc = matrixStatistics.YcKc;
        double meanK = matrixStatistics.meanK;
        RealMatrix blockW(blockSize1,blockSize2);

        //first calculate \langle K^c,K^c \rangle Y
        computeBlockY(m_data.batch(i).label,m_data.batch(j).label,blockW);
        blockW *= KcKc;
        //- \langle Y,K^c \rangle K
        blockW -= YcKc*blockK;
        // -2(\langle K^c,K^c \rangle y - \langle Y,K^c \rangle k) u^T
        // implemented as: -(\langle K^c,K^c \rangle y - \langle Y,K^c \rangle k) u^T - u (\langle K^c,K^c \rangle y - \langle Y,K^c \rangle k)^T
        //todo find out why this is correct and the calculation above is not.
        blockW -= repeat(subrange(KcKc*m_columnMeanY - YcKc*matrixStatistics.k,start2,start2+blockSize2),blockSize1);
        blockW -= trans(repeat(subrange(KcKc*m_columnMeanY - YcKc*matrixStatistics.k,start1,start1+blockSize1),blockSize2));
        // + (\langle K^c,K^c \rangle my - \langle Y,K^c \rangle mk) u u^T
        blockW += KcKc*m_meanY-YcKc*meanK;
        blockW /= KcKc*std::sqrt(KcKc);
        //std::cout<<blockW<<std::endl;
        //symmetry: off-diagonal blocks are used twice
        if(i != j)
            blockW *= 2.0;
        return blockW;
    }

    /// \brief Evaluate the centered kernel Gram matrix.
    ///
    /// The computation is as follows. By means of a
    /// number of identities it holds
    /// \f[ \langle K^c, K^c \rangle = \langle K,K \rangle - 2 n \langle k,k \rangle + mk^2 n^2 \\
    ///     \langle K^c, Y \rangle = \langle K, Y \rangle - 2 n \langle k, y \rangle + n^2 mk\, my \f]
    /// where k is the vector of row means of K, y the vector of row means of Y,
    /// mk and my the overall means of K and Y, and n the number of elements.
    KernelMatrixResults evaluateKernelMatrix()const{
        //it holds
        // \langle K^c,K^c \rangle = \langle K,K \rangle - 2n \langle k,k \rangle + mk^2 n^2
        // \langle K^c,Y \rangle = \langle K, Y \rangle - 2n \langle k, y \rangle + n^2 mk my
        // where k is the vector of row means of K, y the vector of row means of Y,
        // mk, my the overall means of K and Y, and n the number of elements

        double KK = 0;   //stores \langle K,K \rangle
        double YKc = 0;  //stores \langle Y,K \rangle
        RealVector k(m_elements,0.0);//stores the row/column means of K
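        // The Gram matrix is processed block-wise: each thread handles one batch-row i
        // and the column batches j <= i; off-diagonal blocks are counted twice to
        // account for the symmetry of K, and the per-thread accumulators are merged
        // into the global sums afterwards.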
        SHARK_PARALLEL_FOR(int i = 0; i < (int)m_data.numberOfBatches(); ++i){
            std::size_t startRow = 0;
            for(int j = 0; j != i; ++j){
                startRow += size(m_data.batch(j));
            }
            std::size_t rowSize = size(m_data.batch(i));
            double threadKK = 0;
            double threadYKc = 0;
            RealVector threadk(m_elements,0.0);
            std::size_t startColumn = 0; //starting column of the current block
            for(int j = 0; j <= i; ++j){
                std::size_t columnSize = size(m_data.batch(j));
                RealMatrix blockK = (*mep_kernel)(m_data.batch(i).input,m_data.batch(j).input);
                if(i == j){
                    threadKK += frobenius_prod(blockK,blockK);
                    subrange(threadk,startColumn,startColumn+columnSize) += sum_rows(blockK);//update sum_rows(K)
                    threadYKc += updateYKc(m_data.batch(i).label,m_data.batch(j).label,blockK);
                }
                else{//use symmetry of K
                    threadKK += 2.0 * frobenius_prod(blockK,blockK);
                    subrange(threadk,startColumn,startColumn+columnSize) += sum_rows(blockK);
                    subrange(threadk,startRow,startRow+rowSize) += sum_columns(blockK);//symmetry: block(j,i)
                    threadYKc += 2.0 * updateYKc(m_data.batch(i).label,m_data.batch(j).label,blockK);
                }
                startColumn += columnSize;
            }
            SHARK_CRITICAL_REGION{
                KK += threadKK;
                YKc += threadYKc;
                noalias(k) += threadk;
            }
        }
        //calculate the error
        double n = m_elements;
        k /= n;//means
        double meanK = sum(k)/n;
        double n2 = sqr(n);
        double YcKc = YKc - 2.0*n*inner_prod(k,m_columnMeanY) + n2*m_meanY*meanK;
        double KcKc = KK - 2.0*n*inner_prod(k,k) + n2*sqr(meanK);

        KernelMatrixResults results;
        results.k = k;
        results.YcKc = YcKc;
        results.KcKc = KcKc;
        results.meanK = meanK;
        results.error = YcKc/std::sqrt(KcKc);
        return results;
    }
};


}
#endif