// ExportKernelMatrix.h
// (reconstructed from the generated documentation page of this file)
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief export precomputed kernel matrices (using libsvm format)
6  *
7  *
8  *
9  * \author M. Tuma
10  * \date 2012
11  *
12  *
13  * \par Copyright 1995-2015 Shark Development Team
14  *
15  * <BR><HR>
16  * This file is part of Shark.
17  * <http://image.diku.dk/shark/>
18  *
19  * Shark is free software: you can redistribute it and/or modify
20  * it under the terms of the GNU Lesser General Public License as published
21  * by the Free Software Foundation, either version 3 of the License, or
22  * (at your option) any later version.
23  *
24  * Shark is distributed in the hope that it will be useful,
25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27  * GNU Lesser General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License
30  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31  *
32  */
33 //===========================================================================
34 
35 #ifndef SHARK_DATA_PRECOMPUTEDMATRIX_H
36 #define SHARK_DATA_PRECOMPUTEDMATRIX_H
37 
38 
39 
#include <fstream>
#include <iomanip>
#include <stdexcept>

#include <boost/foreach.hpp>

#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/Models/Kernels/AbstractKernelFunction.h>
#include <shark/Models/Kernels/ScaledKernel.h>
#include <shark/Algorithms/Trainers/NormalizeKernelUnitVariance.h>
46 
47 
48 namespace shark
49 {
50 
51 /**
52  * \ingroup shark_globals
53  *
54  * @{
55  */
56 
/// \brief Normalization variants that can be applied to a kernel Gram matrix before export.
enum KernelMatrixNormalizationType
{
	NONE,                                ///< no normalization: output the regular Gram kernel matrix
	MULTIPLICATIVE_TRACE_ONE,            ///< determine the trace, and divide each entry by it
	MULTIPLICATIVE_TRACE_N,              ///< determine the trace, divide each entry by it, then multiply by the number of samples
	MULTIPLICATIVE_VARIANCE_ONE,         ///< normalize to unit variance in feature space. see Kloft in JMLR 2012.
	CENTER_ONLY,                         ///< center the kernel in feature space. see Cortes in JMLR 2012 and in ICML 2010.
	CENTER_AND_MULTIPLICATIVE_TRACE_ONE  ///< first center the kernel in feature space, then divide each entry by the centered kernel's trace
};
66 
67 /// \brief Write a kernel Gram matrix to stream.
68 ///
69 /// \param dataset data basis for the Gram matrix
70 /// \param kernel pointer to kernel function to be used
71 /// \param out The stream to be written to
72 /// \param normalizer what kind of normalization to apply. see enum declaration for details.
73 /// \param scientific should the output be in scientific notation?
74 /// \param fieldwidth field width for pretty printing
75 template<typename InputType, typename LabelType>
77  LabeledData<InputType, LabelType> const &dataset,
78  AbstractKernelFunction<InputType> &kernel, // kernel function (can't be const b/c of ScaledKernel later)
79  std::ostream &out, // The stream to be written to
80  KernelMatrixNormalizationType normalizer = NONE, // what kind of normalization to apply. see enum declaration for details.
81  bool scientific = false, // scientific notation?
82  unsigned int fieldwidth = 0 // for pretty-printing
83 )
84 {
85  //get access to the range of elements
87  typedef typename Elements::reference ElementRef;
88  DataView<LabeledData<InputType, LabelType> const> points(dataset);
89  std::size_t size = points.size();
90 
91  SIZE_CHECK(size != 0);
92  // check outstream status
93  if(!out)
94  {
95  throw(std::invalid_argument("[export_kernel_matrix] Can't write to stream."));
96  }
97 
98  // COMPUTE MODIFIERS
99 
100  // if multiplicative trace normalization: determine trace
101  double trace = 0.0;
102  double trace_factor = 1.0;
103  if(normalizer == MULTIPLICATIVE_TRACE_ONE || normalizer == MULTIPLICATIVE_TRACE_N)
104  {
105  BOOST_FOREACH(ElementRef point, points)
106  {
107  trace += kernel.eval(point.input, point.input);
108  }
109  SHARK_ASSERT(trace > 0);
110  trace_factor = 1.0 / trace;
111  if(normalizer == MULTIPLICATIVE_TRACE_N)
112  {
113  trace_factor *= size;
114  }
115  }
116 
117  // if multiplicative variance normalization: determine factor
118  double variance_factor = 0.0;
119  if(normalizer == MULTIPLICATIVE_VARIANCE_ONE)
120  {
121  ScaledKernel<InputType> scaled(&kernel);
123  normalizer.train(scaled, dataset.inputs());
124  variance_factor = scaled.factor();
125  }
126 
127  // if centering: determine matrix- and row-wise means;
128  double mean = 0;
129  RealVector rowmeans(size, 0.0);
130  if(normalizer == CENTER_ONLY || normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
131  {
132  // initialization: calculate mean and rowmeans
133  for(std::size_t i = 0; i < size; i++)
134  {
135  double k = kernel.eval(points[i].input, points[i].input);
136  mean += k; //add diagonal value to mean once
137  rowmeans(i) += k; //add diagonal to its rowmean
138  for(std::size_t j = 0; j < i; j++)
139  {
140  double k = kernel.eval(points[i].input, points[j].input);
141  mean += 2.0 * k; //add off-diagonals to mean twice
142  rowmeans(i) += k; //add to mean of row
143  rowmeans(j) += k; //add to mean of transposed row
144  }
145  }
146  mean = mean / (double) size / (double) size;
147  rowmeans /= size;
148  // get trace if necessary
149  if(normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
150  {
151  trace = 0.0;
152  for(std::size_t i = 0; i < size; i++)
153  {
154  trace += kernel.eval(points[i].input, points[i].input) - 2 * rowmeans(i) + mean;
155  }
156  SHARK_ASSERT(trace > 0);
157  trace_factor = 1.0 / trace;
158  }
159  }
160 
161  // FIX OUTPUT FORMAT
162 
163  // set output format
164  if(scientific)
165  out.setf(std::ios_base::scientific);
166  std::streamsize ss = out.precision();
167  out.precision(10);
168 
169  // determine dataset type
170  double max_label = -1e100;
171  double min_label = -max_label;
172  bool binary = false;
173  bool regression = false;
174  BOOST_FOREACH(double cur_label, dataset.labels().elements())
175  {
176  if(cur_label > max_label)
177  max_label = cur_label;
178  if(cur_label < min_label)
179  min_label = cur_label;
180  if((cur_label != (int)cur_label) || cur_label < 0)
181  regression = true;
182  }
183  if(!regression && (min_label == 0) && (max_label == 1))
184  binary = true;
185 
186  // WRITE OUT
187 
188  // write to file:
189  // loop through examples (rows)
190  for(std::size_t i = 0; i < size; i++)
191  {
192 
193  // write label
194  if(regression)
195  {
196  out << std::setw(fieldwidth) << std::left << points[i].label << " ";
197  }
198  else if(binary)
199  {
200  out << std::setw(fieldwidth) << std::left << (int)(points[i].label * 2 - 1) << " ";
201  }
202  else
203  {
204  out << std::setw(fieldwidth) << std::left << (unsigned int)(points[i].label + 1) << " ";
205  }
206 
207  out << "0:" << std::setw(fieldwidth) << std::left << i + 1; //write index
208 
209  // loop through examples (columns)
210  // CASE DISTINCTION:
211  if(normalizer == NONE)
212  {
213  for(std::size_t j = 0; j < size; j++)
214  {
215  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << kernel.eval(points[i].input, points[j].input);
216  }
217  out << "\n";
218  }
219  else if(normalizer == MULTIPLICATIVE_TRACE_ONE || normalizer == MULTIPLICATIVE_TRACE_N)
220  {
221  for(std::size_t j = 0; j < size; j++)
222  {
223  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << trace_factor * kernel.eval(points[i].input, points[j].input);
224  }
225  out << "\n";
226  }
227  else if(normalizer == MULTIPLICATIVE_VARIANCE_ONE)
228  {
229  for(std::size_t j = 0; j < size; j++)
230  {
231  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << variance_factor *kernel.eval(points[i].input, points[j].input);
232  }
233  out << "\n";
234  }
235  else if(normalizer == CENTER_ONLY)
236  {
237  for(std::size_t j = 0; j < size; j++)
238  {
239  double tmp = kernel.eval(points[i].input, points[j].input) - rowmeans(i) - rowmeans(j) + mean;
240  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << tmp;
241  }
242  out << "\n";
243  }
244  else if(normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
245  {
246  for(std::size_t j = 0; j < size; j++)
247  {
248  double tmp = kernel.eval(points[i].input, points[j].input) - rowmeans(i) - rowmeans(j) + mean;
249  out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << trace_factor *tmp;
250  }
251  out << "\n";
252  }
253  else
254  {
255  throw SHARKEXCEPTION("[detail::export_kernel_matrix] Unknown normalization type.");
256  }
257 
258  }
259 
260  // clean up
261  out.precision(ss);
262 }
263 
264 
265 
266 /// \brief Write a kernel Gram matrix to file.
267 ///
268 /// \param dataset data basis for the Gram matrix
269 /// \param kernel pointer to kernel function to be used
270 /// \param fn The filename of the file to be written to
271 /// \param normalizer what kind of normalization to apply. see enum declaration for details.
272 /// \param sci should the output be in scientific notation?
273 /// \param width field width for pretty printing
274 template<typename InputType, typename LabelType>
276  LabeledData<InputType, LabelType> const &dataset,
278  std::string fn,
279  KernelMatrixNormalizationType normalizer = NONE,
280  bool sci = false,
281  unsigned int width = 0
282 )
283 {
284  std::ofstream ofs(fn.c_str());
285  if(ofs)
286  {
287  exportKernelMatrix(dataset, kernel, ofs, normalizer, sci, width);
288  }
289  else
290  throw(std::invalid_argument("[detail::export_kernel_matrix] Stream cannot be opened for writing."));
291 
292 }
293 
294 
295 
296 
297 // deprecated wrapper
298 template<typename InputType, typename LabelType>
300  LabeledData<InputType, LabelType> const &dataset,
301  AbstractKernelFunction<InputType> &kernel, // kernel function (can't be const b/c of ScaledKernel later)
302  std::ostream &out, // The stream to be written to
303  KernelMatrixNormalizationType normalizer = NONE, // what kind of normalization to apply. see enum declaration for details.
304  bool scientific = false, // scientific notation?
305  unsigned int fieldwidth = 0 // for pretty-printing
306 )
307 {
308  exportKernelMatrix(dataset, kernel, out, normalizer, scientific, fieldwidth);
309 }
310 
311 
312 // deprecated wrapper
313 template<typename InputType, typename LabelType>
315  LabeledData<InputType, LabelType> const &dataset,
317  std::string fn,
318  KernelMatrixNormalizationType normalizer = NONE,
319  bool sci = false,
320  unsigned int width = 0
321 )
322 {
323  exportKernelMatrix(dataset, kernel, fn, normalizer, sci, width);
324 }
325 
326 
327 
328 // TODO: import functionality is still missing.
329 // when that is done, add tutorial
330 
331 
332 /** @}*/
333 
334 } // namespace shark
335 
336 
337 
338 #endif // SHARK_DATA_PRECOMPUTEDMATRIX_H