DataDistribution.h — Shark header defining learning problems given by analytic distributions (Doxygen documentation-page export; embedded numeric prefixes are line numbers from that export).
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Learning problems given by analytic distributions.
6  *
7  *
8  *
9  *
10  * \author T. Glasmachers
11  * \date 2006-2013
12  *
13  *
14  * \par Copyright 1995-2015 Shark Development Team
15  *
16  * <BR><HR>
17  * This file is part of Shark.
18  * <http://image.diku.dk/shark/>
19  *
20  * Shark is free software: you can redistribute it and/or modify
21  * it under the terms of the GNU Lesser General Public License as published
22  * by the Free Software Foundation, either version 3 of the License, or
23  * (at your option) any later version.
24  *
25  * Shark is distributed in the hope that it will be useful,
26  * but WITHOUT ANY WARRANTY; without even the implied warranty of
27  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28  * GNU Lesser General Public License for more details.
29  *
30  * You should have received a copy of the GNU Lesser General Public License
31  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
32  *
33  */
34 //===========================================================================
35 
36 
37 #ifndef SHARK_DATA_DATADISTRIBUTION_H
38 #define SHARK_DATA_DATADISTRIBUTION_H
39 
40 #include <shark/Data/Dataset.h>
41 #include <shark/Rng/GlobalRng.h>
43 #include <utility>
44 
45 namespace shark {
46 
47 
48 ///
49 /// \brief A DataDistribution defines an unsupervised learning problem.
50 ///
51 /// \par
52 /// The unsupervised learning problem is defined by an explicit
53 /// distribution (in contrast to a finite dataset). The only
54 /// method we need is to draw a sample from the distribution.
55 ///
56 template <class InputType>
// NOTE(review): the class head line (presumably "class DataDistribution") is
// missing from this extract (Doxygen line 57 was dropped); the brace below
// opens that class.
58 {
59 public:
60  /// \brief Virtual destructor.
61  virtual ~DataDistribution() { }
62 
63  /// \brief Generates a single input sample (unsupervised: no label).
64  ///
65  /// @param input the generated input
66  virtual void draw(InputType& input) const = 0;
67 
68  /// \brief Interface for std::generate: draws and returns one input.
// NOTE(review): the signature line (presumably "InputType operator() () {")
// is missing from this extract (Doxygen line 69 was dropped).
70  InputType ret;
71  draw(ret);
72  return ret;
73  }
74 
75  /// \brief Generates a data set with samples from the distribution.
76  ///
77  /// @param size the number of samples in the dataset
78  /// @param maximumBatchSize the maximum size of a batch
79  UnlabeledData<InputType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const {
// smallest number of batches such that none exceeds maximumBatchSize
80  std::size_t batches = (size + maximumBatchSize - 1) / maximumBatchSize;
81  std::size_t optimalBatchSize = size / batches;
// the first 'remainder' batches receive one extra element each
82  std::size_t remainder = size - batches * optimalBatchSize;
83  UnlabeledData<InputType> dataset(batches);
84  InputType input;
85 
86  // now create and fill the batches, taking the remainder into account
87  for (std::size_t i=0; i<batches; ++i)
88  {
89  std::size_t batchsize = (i<remainder) ? optimalBatchSize + 1 : optimalBatchSize;
90  typename UnlabeledData<InputType>::batch_reference b = dataset.batch(i);
// the first drawn element serves as a shape template for the whole batch
91  draw(input);
92  b = Batch<InputType>::createBatch(input, batchsize);
93  for (std::size_t j=0; j<batchsize; j++)
94  {
// element 0 was already drawn above; draw fresh samples for the rest
95  if (j != 0) draw(input);
96  shark::get(b, j) = input;
97  }
98  }
99  return dataset;
100  }
101 
102  /// \brief Generates a data set with samples from the distribution.
103  ///
104  /// @param size the number of samples in the dataset
// NOTE(review): the overload's signature and body (presumably forwarding to
// generateDataset(size, defaultBatchSize)) are missing from this extract
// (Doxygen lines 105-106 were dropped); only the closing brace remains.
107  }
108 };
109 
110 
111 ///
112 /// \brief A LabeledDataDistribution defines a supervised learning problem.
113 ///
114 /// \par
115 /// The supervised learning problem is defined by an explicit
116 /// distribution (in contrast to a finite dataset). The only
117 /// method we need is to draw a sample from the distribution.
118 ///
119 template <class InputType, class LabelType>
// NOTE(review): the class head line (presumably "class
// LabeledDataDistribution") is missing from this extract (Doxygen line 120
// was dropped); the brace below opens that class.
121 {
122 public:
123  /// \brief Virtual destructor.
// NOTE(review): the destructor definition itself is missing from this
// extract (Doxygen line 124 was dropped).
125 
126  /// \brief Generates a single pair of input and label.
127  /// @param input the generated input
128  /// @param label the generated label
129  virtual void draw(InputType& input, LabelType& label) const = 0;
130 
131  /// \brief Interface for std::generate: draws and returns one (input, label) pair.
132  std::pair<InputType,LabelType> operator() () {
133  std::pair<InputType,LabelType> ret;
134  draw(ret.first,ret.second);
135  return ret;
136  }
137 
138  /// \brief Generates a dataset with samples from the distribution.
139  ///
140  /// @param size the number of samples in the dataset
141  /// @param maximumBatchSize the maximum size of a batch
142  LabeledData<InputType, LabelType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const
143  {
144  // first determine the optimal number of batches and their sizes
145  std::size_t batches = (size + maximumBatchSize - 1) / maximumBatchSize;
146  std::size_t optimalBatchSize = size / batches;
// the first 'remainder' batches receive one extra element each
147  std::size_t remainder = size - batches * optimalBatchSize;
148  LabeledData<InputType, LabelType> dataset(batches);
149  InputType input;
150  LabelType label;
151  DataPair<InputType, LabelType> pair(input, label);
152 
153  // now create and fill the batches, taking the remainder into account
154  for (std::size_t i=0; i<batches; ++i)
155  {
156  std::size_t batchsize = (i<remainder) ? optimalBatchSize + 1 : optimalBatchSize;
// NOTE(review): the declaration of the batch reference 'b' (Doxygen line 157)
// is missing from this extract; 'b' below presumably refers to
// dataset.batch(i).
// the first drawn pair serves as a shape template for the whole batch
158  draw(input, label); pair.input = input; pair.label = label;
// NOTE(review): the createBatch call initializing 'b' from 'pair' (Doxygen
// line 159) is missing from this extract.
160  for (std::size_t j=0; j<batchsize; j++)
161  {
// pair 0 was already drawn above; draw fresh samples for the rest
162  if (j != 0) draw(input, label);
163  shark::get(b, j).input = input;
164  shark::get(b, j).label = label;
165  }
166  }
167  return dataset;
168  }
169 
170  /// \brief Generates a data set with samples from the distribution.
171  ///
172  /// @param size the number of samples in the dataset
// NOTE(review): the overload's signature and body (Doxygen lines 173-174) are
// missing from this extract; only the closing brace remains.
175  }
176 };
177 
178 
179 ///
180 /// \brief "chess board" problem for binary classification
181 ///
182 class Chessboard : public LabeledDataDistribution<RealVector, unsigned int>
183 {
184 public:
185  Chessboard(unsigned int size = 4, double noiselevel = 0.0)
186  {
187  m_size = size;
188  m_noiselevel = noiselevel;
189  }
190 
191 
192  void draw(RealVector& input, unsigned int& label)const{
193  input.resize(2);
194  unsigned int j, t = 0;
195  for (j = 0; j < 2; j++)
196  {
197  double v = Rng::uni(0.0, (double)m_size);
198  t += (int)floor(v);
199  input(j) = v;
200  }
201  label = (t & 1);
202  if (Rng::uni(0.0, 1.0) < m_noiselevel) label = 1 - label;
203  }
204 
205 protected:
206  unsigned int m_size;
207  double m_noiselevel;
208 };
209 
210 
211 ///
212 /// \brief Noisy sinc function: y = sin(x) / x + noise
213 ///
214 class Wave : public LabeledDataDistribution<RealVector, RealVector>
215 {
216 public:
217  Wave(double stddev = 0.1, double range = 5.0){
218  m_stddev = stddev;
219  m_range = range;
220  }
221 
222 
223  void draw(RealVector& input, RealVector& label)const{
224  input.resize(1);
225  label.resize(1);
226  input(0) = Rng::uni(-m_range, m_range);
227  if(input(0) != 0)
228  label(0) = sin(input(0)) / input(0) + Rng::gauss(0.0, m_stddev);
229  else
230  label(0) = Rng::gauss(0.0, m_stddev);
231  }
232 
233 protected:
234  double m_stddev;
235  double m_range;
236 };
237 
238 
239 
240 /// "Pami Toy" problem for binary classification, as used in the article "Glasmachers
241 /// and C. Igel. Maximum Likelihood Model Selection for 1-Norm Soft Margin SVMs with Multiple
242 /// Parameters. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2010."
243 /// In summary, the first M dimensions are correlated to the labels, the last N dimensions
244 /// are not.
245 class PamiToy : public LabeledDataDistribution<RealVector, unsigned int>
246 {
247 public:
248  PamiToy(unsigned int size_useful = 5, unsigned int size_noise = 5, double noise_position = 0.0, double noise_variance = 1.0 )
249  : m_size( size_useful+size_noise ),
250  m_sizeUseful( size_useful ),
251  m_sizeNoise( size_noise ),
252  m_noisePos( noise_position) ,
253  m_noiseVar( noise_variance )
254  { }
255 
256  void draw(RealVector& input, unsigned int& label)const{
257  input.resize( m_size );
258  label = (unsigned int) Rng::discrete(0,1); //fix label first
259  double y2 = label - 0.5; //"clean" informative feature values
260  // now fill the informative features..
261  for ( unsigned int i=0; i<m_sizeUseful; i++ ) {
262  input(i) = y2 + Rng::gauss( m_noisePos, m_noiseVar );
263  }
264  // ..and the uninformative ones
265  for ( unsigned int i=m_sizeUseful; i<m_size; i++ ) {
266  input(i) = Rng::gauss( m_noisePos, m_noiseVar );
267  }
268  }
269 
270 protected:
271  unsigned int m_size;
272  unsigned int m_sizeUseful;
273  unsigned int m_sizeNoise;
274  double m_noisePos;
275  double m_noiseVar;
276 };
277 
278 /// This class randomly fills a (hyper-)square with data points. Points which
279 /// happen to be within a (hyper-)circle centered in the square of a certain
280 /// radius get a positive class label. Noise on the labels can be added.
281 class CircleInSquare : public LabeledDataDistribution<RealVector, unsigned int>
282 {
283 public:
 /// \brief Constructor. Defaults to the box [-1,1]^d with a radius-0.5
 /// circle centered at the origin.
 ///
 /// @param dimensions dimensionality of the (hyper-)square
 /// @param noiselevel probability of flipping the computed label
 /// @param class_prob_equal if true, sample so both classes are equally likely
284  CircleInSquare( unsigned int dimensions = 2, double noiselevel = 0.0, bool class_prob_equal = false )
285  : m_dimensions( dimensions ),
286  m_noiselevel( noiselevel ),
287  m_lowerLimit( -1 ),
288  m_upperLimit( 1 ),
289  m_centerpoint( 0 ),
290  m_inner_radius2( 0.5*0.5 ),
291  m_outer_radius2( 0.5*0.5 ),
292  m_equal_class_prob( class_prob_equal )
293  { }
294 
295  /// allow for arbitrary box limits
 ///
 /// @param lower_limit lower bound of the box in every dimension
 /// @param upper_limit upper bound of the box in every dimension
 /// @param inner_radius points closer than this to the center are positive
 /// @param outer_radius points farther than this from the center are negative;
 ///        the ring between the two radii is excluded (margin)
296  void setLimits( double lower_limit, double upper_limit, double inner_radius, double outer_radius )
297  {
298  RANGE_CHECK( lower_limit < upper_limit );
299  RANGE_CHECK( inner_radius <= outer_radius );
300  RANGE_CHECK( 2*outer_radius <= upper_limit-lower_limit );
301  m_lowerLimit = lower_limit;
302  m_upperLimit = upper_limit;
// NOTE(review): suspected bug — the midpoint of [lower, upper] is
// (lower + upper)/2, but this computes (upper - lower)/2. The two agree only
// when lower_limit == 0; e.g. setLimits(-1, 1, ...) yields centerpoint 1,
// not 0. Confirm intent before changing.
303  m_centerpoint = (upper_limit-lower_limit)/2.0;
// radii are stored squared so draw() can compare squared distances
304  m_inner_radius2 = inner_radius*inner_radius;
305  m_outer_radius2 = outer_radius*outer_radius;
306  }
307 
 /// \brief Draws one point and labels it by its distance to the center.
308  void draw(RealVector& input, unsigned int& label)const
309  {
310  input.resize( m_dimensions );
311  double v, dist;
312 
313  if ( m_equal_class_prob ) { //each class has equal probability - this implementation is brute-force and gorgeously inefficient :/
314  bool this_label = Rng::coinToss();
315  label = ( this_label ? 1 : 0 );
316  if ( Rng::uni(0.0, 1.0) < m_noiselevel )
317  label = 1 - label;
// rejection-sample a point consistent with the chosen (pre-noise) class
318  if ( this_label ) {
319  do {
320  dist = 0.0;
321  for ( unsigned int i=0; i<m_dimensions; i++ ) {
322  v = Rng::uni( m_lowerLimit, m_upperLimit );
323  input(i) = v;
324  dist += (v-m_centerpoint)*(v-m_centerpoint);
325  }
326  } while( dist > m_inner_radius2 );
327  }
328  else {
329  do {
330  dist = 0.0;
331  for ( unsigned int i=0; i<m_dimensions; i++ ) {
332  v = Rng::uni( m_lowerLimit, m_upperLimit );
333  input(i) = v;
334  dist += (v-m_centerpoint)*(v-m_centerpoint);
335  }
336  } while( dist < m_outer_radius2 );
337  }
338  }
339  else { //equal probability to be anywhere in the cube
// rejection-sample until the point lies outside the margin ring between
// inner and outer radius
340  do {
341  dist = 0.0;
342  for ( unsigned int i=0; i<m_dimensions; i++ ) {
343  v = Rng::uni( m_lowerLimit, m_upperLimit );
344  input(i) = v;
345  dist += (v-m_centerpoint)*(v-m_centerpoint);
346  }
347  label = ( dist < m_inner_radius2 ? 1 : 0 );
348  if ( Rng::uni(0.0, 1.0) < m_noiselevel )
349  label = 1 - label;
350  } while( dist > m_inner_radius2 && dist < m_outer_radius2 );
351  }
352  }
353 
354 protected:
355  unsigned int m_dimensions;
356  double m_noiselevel;
357  double m_lowerLimit;
358  double m_upperLimit;
// NOTE(review): the declarations of m_centerpoint, m_inner_radius2, and
// m_outer_radius2 (Doxygen lines 359-361) are missing from this extract;
// the constructor and setLimits above clearly initialize them.
362  bool m_equal_class_prob; ///<if true, the probability to belong to either class is equal. if false, it is uniform over the cube.
363 };
364 
365 // This class randomly fills a 4x4 square in the 2D-plane with data points.
366 // Points in the lower left diagonal half are negative, points in the
367 // upper right diagonal half are positive. But additionally, all points
368 // in a circle located in the lower right quadrant are positive, effectively
369 // bulging the decision boundary inward. Noise on the labels can be added.
370 class DiagonalWithCircle : public LabeledDataDistribution<RealVector, unsigned int>
371 {
372 public:
373  DiagonalWithCircle( double radius = 1.0, double noise = 0.0 )
374  : m_radius2( radius*radius ),
375  m_noiselevel( noise )
376  { }
377 
378  void draw(RealVector& input, unsigned int& label)const
379  {
380  input.resize( 2 );
381  double x,y;
382  x = Rng::uni( 0, 4 ); //zero is left
383  y = Rng::uni( 0, 4 ); //zero is bottom
384  // assign label according to position w.r.t. the diagonal
385  if ( x+y < 4 )
386  label = 1;
387  else
388  label = 0;
389  // but if in the circle (even above diagonal), assign positive label
390  if ( (3-x)*(3-x) + (1-y)*(1-y) < m_radius2 )
391  label = 1;
392 
393  // add noise
394  if ( Rng::uni(0.0, 1.0) < m_noiselevel )
395  label = 1 - label;
396  input(0) = x;
397  input(1) = y;
398  }
399 
400 protected:
401  double m_radius2;
402  double m_noiselevel;
403 };
404 
405 
406 /// \brief Generates a set of normally distributed points
// NOTE(review): the class head line (presumably "class NormalDistributedPoints
// : public DataDistribution<RealVector>", Doxygen line 407) is missing from
// this extract; the brace below opens that class.
408 {
409 public:
410  /// \brief Constructs a standard normal distribution in the given dimension
411  NormalDistributedPoints(std::size_t dim): m_offset(dim,0){
412  RealMatrix covariance(dim,dim,0);
// unit variance in every coordinate, zero covariance across coordinates
413  diag(covariance) = blas::repeat(1.0,dim);
414  m_dist.setCovarianceMatrix(covariance);
415  }
 /// \brief Constructs a general multivariate normal distribution.
 ///
 /// @param covariance covariance matrix of the distribution
 /// @param offset mean vector; its size must match the covariance dimension
416  NormalDistributedPoints(RealMatrix const& covariance, RealVector const& offset)
417  :m_dist(covariance), m_offset(offset){
418  SIZE_CHECK(offset.size() == covariance.size1());
419  }
 /// \brief Draws one sample: the mean offset plus correlated Gaussian noise.
420  void draw(RealVector& input) const{
421  input.resize(m_offset.size());
422  noalias(input) = m_offset;
// m_dist(rng) apparently returns a pair whose .first is the sample vector
423  noalias(input) += m_dist(Rng::globalRng).first;
424  }
425 private:
// NOTE(review): the declaration of the sampler member 'm_dist' (Doxygen line
// 426) is missing from this extract; the constructors above clearly
// initialize it.
427  RealVector m_offset;
428 };
429 
430 /// \brief Given a set of images, draws a set of image patches of a given size
431 class ImagePatches:public DataDistribution<RealVector>{
432 public:
 /// \brief Constructor.
 ///
 /// NOTE(review): the constructor's name line (Doxygen line 433, presumably
 /// "ImagePatches(") is missing from this extract; the parameter list below
 /// belongs to it.
 ///
 /// @param images dataset of images, each flattened row-major into a vector
 /// @param imageWidth width of each source image in pixels
 /// @param imageHeight height of each source image in pixels
 /// @param patchWidth width of the sampled patches
 /// @param patchHeight height of the sampled patches
434  Data<RealVector> images,
435  std::size_t imageWidth, std::size_t imageHeight,
436  std::size_t patchWidth, std::size_t patchHeight
437  ):m_images(images)
438  , m_imageWidth(imageWidth)
439  , m_imageHeight(imageHeight)
440  , m_patchWidth(patchWidth)
441  , m_patchHeight(patchHeight)
442  ,m_numImages(m_images.numberOfElements()){}
443 
 /// \brief Draws one patch: picks an image, then a random patch position.
444  void draw(RealVector& input) const{
445  //sample image
446  std::size_t imageNum = Rng::discrete(0,m_numImages-1);
447  Data<RealVector>::const_element_reference image = m_images.element(imageNum);
448  //draw the upper left corner of the image
// NOTE(review): despite the m_ prefix these two are local variables, not
// members; renaming them (startX/startY) would avoid confusion.
449  std::size_t m_startX = Rng::discrete(0,m_imageWidth-m_patchWidth);
450  std::size_t m_startY = Rng::discrete(0,m_imageHeight-m_patchHeight);
451 
452 
453  //copy patch
454  input.resize(m_patchWidth * m_patchHeight);
// flat index of the first pixel of the current patch row in the source image
455  std::size_t rowStart = m_startY * m_imageWidth + m_startX;
456  for (size_t y = 0; y < m_patchHeight; ++y){
457  for (size_t x = 0; x < m_patchWidth; ++x){
458  input(y * m_patchWidth + x) = image(rowStart+x);
459  }
460  rowStart += m_imageWidth;
461  }
462  }
463 private:
464  Data<RealVector> m_images;
465  std::size_t m_imageWidth;
466  std::size_t m_imageHeight;
467  std::size_t m_patchWidth;
468  std::size_t m_patchHeight;
469  std::size_t m_numImages;
470 };
471 
472 }
473 #endif