CVDatasetTools.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Tools for cross-validation
6  *
7  *
8  *
9  * \author O.Krause
10  * \date 2010-2012
11  *
12  *
13  * \par Copyright 1995-2015 Shark Development Team
14  *
15  * <BR><HR>
16  * This file is part of Shark.
17  * <http://image.diku.dk/shark/>
18  *
19  * Shark is free software: you can redistribute it and/or modify
20  * it under the terms of the GNU Lesser General Public License as published
21  * by the Free Software Foundation, either version 3 of the License, or
22  * (at your option) any later version.
23  *
24  * Shark is distributed in the hope that it will be useful,
25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27  * GNU Lesser General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License
30  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31  *
32  */
33 //===========================================================================
34 
35 #ifndef SHARK_DATA_CVDATASETTOOLS_H
36 #define SHARK_DATA_CVDATASETTOOLS_H
37 
38 #include <shark/Data/Dataset.h>
40 #include <algorithm>
41 //
42 #include <shark/Data/DataView.h>
43 
44 #include <utility> //for std::pair
45 
46 namespace shark {
47 
48 
49 template<class DatasetTypeT>
50 class CVFolds {
51 public:
52  typedef DatasetTypeT DatasetType;
53  typedef typename DatasetType::IndexSet IndexSet;
54 
55  /// \brief Creates an empty set of folds.
56  CVFolds() {}
57  ///\brief partitions set in validation folds indicated by the second argument.
58  ///
59  /// The folds are given as the batch indices of the validation sets
61  DatasetType const &set,
62  std::vector<IndexSet> const &validationIndizes
63  ) : m_dataset(set),m_validationFolds(validationIndizes) {}
64 
66  DatasetType const &set,
67  std::vector<std::size_t> const &foldStart
68  ) : m_dataset(set){
69  for (std::size_t partition = 0; partition != foldStart.size(); partition++) {
70  std::size_t partitionSize = (partition+1 == foldStart.size()) ? set.numberOfBatches() : foldStart[partition+1];
71  partitionSize -= foldStart[partition];
72  //create the set with the indices of the validation set of the current partition
73  //also update the starting element
74  IndexSet validationIndizes(partitionSize);
75  for (std::size_t batch = 0; batch != partitionSize; ++batch) {
76  validationIndizes[batch]=batch+foldStart[partition];
77  }
78  m_validationFolds.push_back(validationIndizes);
79  }
80  }
81 
82  DatasetType training(std::size_t i) const {
83  SIZE_CHECK(i < size());
84 
85  return indexedSubset(m_dataset, trainingFoldIndices(i));
86  }
87  DatasetType validation(std::size_t i) const {
88  SIZE_CHECK(i < size());
89 
90  return indexedSubset(m_dataset,validationFoldIndices(i));
91  }
92 
93  ///\brief returns the indices that make up the i-th validation fold
94  IndexSet const &validationFoldIndices(std::size_t i)const {
95  SIZE_CHECK(i < size());
96  return m_validationFolds[i];
97  }
98 
99  IndexSet trainingFoldIndices(std::size_t i)const {
100  SIZE_CHECK(i < size());
101  IndexSet trainingFold;
102  detail::complement(m_validationFolds[i], m_dataset.numberOfBatches(), trainingFold);
103  return trainingFold;
104  }
105 
106  ///\brief Returns the number of folds of the dataset.
107  std::size_t size()const {
108  return m_validationFolds.size();
109  }
110 
111  //~ /// \brief Returns the overall number of elements in the partitioned dataset
112  //~ std::size_t numberOfElements() const {
113  //~ return m_foldElementStart[size()];
114  //~ }
115  //~ /// \brief Returns the overall number of elements in the i-th training fold
116  //~ std::size_t numberOfTrainingElements(std::size_t i) const {
117  //~ SIZE_CHECK(i < size());
118  //~ return m_datasetSize-m_validationFoldSizes[i];
119  //~ }
120  //~ /// \brief Returns the overall number of elements in the i-th valdiation fold
121  //~ std::size_t numberOfValidationElements(std::size_t i) const {
122  //~ SIZE_CHECK(i < size());
123  //~ return m_validationFoldSizes[i];
124  //~ }
125 
126  /// \brief Returns the dataset underying the folds
127  DatasetType const& dataset()const{
128  return m_dataset;
129  }
130 
131  /// \brief Returns the dataset underying the folds
132  DatasetType& dataset(){
133  return m_dataset;
134  }
135 
136 private:
137  DatasetType m_dataset;
138  std::vector<IndexSet> m_validationFolds;
139  std::size_t m_datasetSize;
140  std::vector<std::size_t> m_validationFoldSizes;
141 };
142 
143 
144 /// auxiliary typedef for createCVSameSizeBalanced and createCVFullyIndexed, stores location index in the first and partition index in the second
145 typedef std::pair< std::vector<std::size_t> , std::vector<std::size_t> > RecreationIndices;
146 
147 namespace detail {
148 
149 ///\brief Version of createCVSameSizeBalanced which works regardless of the label type
150 ///
151 /// Instead of a class label to interpret, this class uses a membership vector for every
152 /// class which members[k][i] returns the positon of the i-th member of class k in the set.
153 template<class I, class L>
155  LabeledData<I,L> &set,
156  std::size_t numberOfPartitions,
157  std::vector< std::vector<std::size_t> > members,
158  std::size_t batchSize,
159  RecreationIndices * cv_indices = NULL //if not NULL: the first vector stores location information, and
160  // the second the partition information. The i-th value of the
161  // first vector shows what the original position of the now i-th
162  // sample was. The i-th value of the second vector shows what
163  // partition that sample now belongs to.
164 ) {
165  std::size_t numInputs = set.numberOfElements();
166  std::size_t numClasses = members.size();
167 
168  //shuffle elements in members
169  DiscreteUniform< Rng::rng_type > uni(shark::Rng::globalRng) ;
170  for (std::size_t c = 0; c != numClasses; c++) {
171  std::random_shuffle(members[c].begin(), members[c].end(), uni);
172  }
173 
174  //calculate number of elements per validation subset in the new to construct container
175  std::size_t nn = numInputs / numberOfPartitions;
176  std::size_t leftOver = numInputs % nn;
177  std::vector<std::size_t> validationSize(numberOfPartitions,nn);
178  for (std::size_t partition = 0; partition != leftOver; partition++) {
179  validationSize[partition]++;
180  }
181 
182  //calculate the size of the batches for every validation part
183  std::vector<std::size_t> partitionStart;
184  std::vector<std::size_t> batchSizes;
185  std::size_t numBatches = batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
186 
187 
188  LabeledData<I,L> newSet(numBatches);//set of empty batches
189  DataView<LabeledData<I,L> > setView(set);//fast access to single elements of the original set
190  std::vector<std::size_t> validationSetStart = partitionStart;//current index for the batch of every fold
191  //partition classes into the validation subsets of newSet
192  std::size_t fold = 0;//current fold
193  std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
194 
195  //initialize the list of position indices which can later be used to re-create the fold (via createCV(Fully)Indexed)
196  if ( cv_indices != NULL ) {
197  cv_indices->first.clear();
198  cv_indices->first.resize( numInputs );
199  cv_indices->second.clear();
200  cv_indices->second.resize( numInputs );
201  }
202 
203  size_t j = 0; //for recreation indices
204  for (std::size_t c = 0; c != numClasses; c++) {
205  for (std::size_t i = 0; i != members[c].size(); i++) {
206  std::size_t oldPos = members[c][i];
207  std::size_t batchNumber = validationSetStart[fold];
208 
209  batchElements[fold].push_back(oldPos);
210 
211  if ( cv_indices != NULL ) {
212  cv_indices->first[ j ] = oldPos; //store the position in which the (now) i-th sample previously resided
213  cv_indices->second[ j ] = fold; //store the partition to which the (now) i-th sample gets assigned
214  // old: //(*cv_indices)[ oldPos ] = fold; //store in vector to recreate partition if desired
215  }
216 
217  //if all elements for the current batch are found, create it
218  if (batchElements[fold].size() == batchSizes[batchNumber]) {
219  newSet.batch(validationSetStart[fold]) = subBatch(setView,batchElements[fold]);
220  batchElements[fold].clear();
221  ++validationSetStart[fold];
222  }
223 
224  fold = (fold+1) % numberOfPartitions;
225 
226  j++;
227  }
228  }
229  SHARK_ASSERT( j == numInputs );
230 
231  //swap old and new set
232  swap(set, newSet);
233 
234  //create folds
235  return CVFolds<LabeledData<I,L> >(set,partitionStart);
236 
237 }
238 }//namespace detail
239 
240 /**
241  * \ingroup shark_globals
242  *
243  * @{
244  */
245 
246 //! \brief Create a partition for cross validation
247 //!
248 //! The subset each training examples belongs to
249 //! is drawn independently and uniformly distributed.
250 //! For every partition, all but one subset form the
251 //! training set, while the remaining one is used for
252 //! validation. The partitions can be accessed using
253 //! getCVPartitionName
254 //!
255 //! \param set the input data for which the new partitions are created
256 //! \param numberOfPartitions number of partitions to create
257 //! \param batchSize maximum batch size
258 template<class I,class L>
260  std::size_t numberOfPartitions,
261  std::size_t batchSize=Data<I>::DefaultBatchSize) {
262  std::vector<std::size_t> indices(set.numberOfElements());
263  for (std::size_t i=0; i != set.numberOfElements(); i++)
264  indices[i] = Rng::discrete(0, numberOfPartitions - 1);
265  return createCVIndexed(set,numberOfPartitions,indices,batchSize);
266 }
267 
268 //! \brief Create a partition for cross validation
269 //!
270 //! Every subset contains (approximately) the same
271 //! number of elements. For every partition, all
272 //! but one subset form the training set, while the
273 //! remaining one is used for validation. The partitions
274 //! can be accessed using getCVPartitionName
275 //!
276 //! \param numberOfPartitions number of partitions to create
277 //! \param set the input data from which to draw the partitions
278 //! \param batchSize maximum batch size
279 template<class I,class L>
280 CVFolds<LabeledData<I,L> > createCVSameSize(LabeledData<I,L> &set,std::size_t numberOfPartitions,std::size_t batchSize = LabeledData<I,L>::DefaultBatchSize) {
281  std::size_t numInputs = set.numberOfElements();
282 
283  //calculate the number of validation examples for every partition
284  std::vector<std::size_t> validationSize(numberOfPartitions);
285  std::size_t inputsForValidation = numInputs / numberOfPartitions;
286  std::size_t leftOver = numInputs - inputsForValidation * numberOfPartitions;
287  for (std::size_t i = 0; i != numberOfPartitions; i++) {
288  std::size_t vs=inputsForValidation+(i<leftOver);
289  validationSize[i] =vs;
290  }
291 
292  //calculate the size of batches for every validation part and their total number
293  std::vector<std::size_t> partitionStart;
294  std::vector<std::size_t> batchSizes;
295  detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
296 
297  set.repartition(batchSizes);
298  set.shuffle();
299 
300  CVFolds<LabeledData<I,L> > folds(set,partitionStart);
301  return folds;//set;
302 }
303 
304 
305 //! \brief Create a partition for cross validation
306 //!
307 //! Every subset contains (approximately) the same
308 //! number of elements. For every partition, all
309 //! but one subset form the training set, while the
310 //! remaining one is used for validation.
311 //!
312 //! \param numberOfPartitions number of partitions to create
313 //! \param set the input data from which to draw the partitions
314 //! \param batchSize maximum batch size
315 //! \param cv_indices if not NULL [default]: for each element, store the fold it is assigned to; this can be used to later/externally recreate the fold via createCVIndexed
316 template<class I>
319  std::size_t numberOfPartitions,
320  std::size_t batchSize=Data<I>::DefaultBatchSize,
321  RecreationIndices * cv_indices = NULL //if not NULL: for each element, store the fold it is assigned to; this can be used to later/externally recreate the fold via createCVIndexed
322 ){
324  std::size_t numInputs = setView.size();
325  std::size_t numClasses = numberOfClasses(set);
326 
327 
328  //find members of each class
329  std::vector< std::vector<std::size_t> > members(numClasses);
330  for (std::size_t i = 0; i != numInputs; i++) {
331  members[setView[i].label].push_back(i);
332  }
333  return detail::createCVSameSizeBalanced(set, numberOfPartitions, members, batchSize, cv_indices);
334 
335 }
336 
337 //! \brief Create a partition for cross validation without changing the dataset
338 //!
339 //! This method behaves similar to createCVIID
340 //! with the difference that batches are not reordered. Thus the batches
341 //! are only rearranged randomly in folds, but the dataset itself is not changed.
342 //!
343 //! \param numberOfPartitions number of partitions to create
344 //! \param set the input data from which to draw the partitions
345 template<class I, class L>
347  LabeledData<I,L> const& set,
348  std::size_t numberOfPartitions
349 ){
350  std::vector<std::size_t> indizes(set.numberOfBatches());
351  for(std::size_t i= 0; i != set.numberOfBatches(); ++i)
352  indizes[i] = i;
353  DiscreteUniform<Rng::rng_type> uni(Rng::globalRng);
354  shark::shuffle(indizes.begin(),indizes.end(), uni);
355 
356  typedef typename LabeledData<I,L>::IndexSet IndexSet;
357 
358  std::vector<IndexSet> folds;
359  std::size_t partitionSize = set.numberOfBatches()/numberOfPartitions;
360  std::size_t remainder = set.numberOfBatches() - partitionSize*numberOfPartitions;
361  std::vector<std::size_t>::iterator pos = indizes.begin();
362  for(std::size_t i = 0; i!= numberOfPartitions; ++i){
363  std::size_t size = partitionSize;
364  if(remainder> 0){
365  ++size;
366  --remainder;
367  }
368  folds.push_back(IndexSet(pos,pos+size));
369  pos+=size;
370  }
371  return CVFolds<LabeledData<I,L> >(set,folds);
372 }
373 
374 //! \brief Create a partition for cross validation from indices
375 //!
376 //! Create a partition from indices. The indices vector for each sample states of what
377 //! validation partition that sample should become a member. In other words, the index
378 //! maps a sample to a validation partition, meaning that it will become a part of the
379 //! training partition for all other folds.
380 //!
381 //! \param set partitions will be subsets of this set
382 //! \param numberOfPartitions number of partitions to create
383 //! \param indices partition indices of the examples in [0, ..., numberOfPartitions[.
384 //! \param batchSize maximum batch size
385 template<class I,class L>
387  LabeledData<I,L> &set,
388  std::size_t numberOfPartitions,
389  std::vector<std::size_t> indices,
390  std::size_t batchSize=Data<I>::DefaultBatchSize
391 ) {
392  std::size_t numInputs = set.numberOfElements();
393  SIZE_CHECK(indices.size() == numInputs);
394  SIZE_CHECK(numberOfPartitions == *std::max_element(indices.begin(),indices.end())+1);
395 
396  //calculate the size of validation partitions
397  std::vector<std::size_t> validationSize(numberOfPartitions,0);
398  for (std::size_t input = 0; input != numInputs; input++) {
399  validationSize[indices[input]]++;
400  }
401 
402  //calculate the size of batches for every validation part and their total number
403  std::vector<std::size_t> partitionStart;
404  std::vector<std::size_t> batchSizes;
405  std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
406 
407  //construct a new set with the correct batch format from the old set
408  LabeledData<I,L> newSet(numBatches);
409  DataView<LabeledData<I,L> > setView(set); //fast access to single elements of the original set
410  std::vector<std::size_t> validationSetStart = partitionStart; //current index for the batch of every partition
411  std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
412  for (std::size_t input = 0; input != numInputs; input++) {
413  std::size_t partition = indices[input];
414  batchElements[partition].push_back(input);
415 
416  //if all elements for the current batch are found, create it
417  std::size_t batchNumber = validationSetStart[partition];
418  if (batchElements[partition].size() == batchSizes[batchNumber]) {
419  newSet.batch(validationSetStart[partition]) = subBatch(setView,batchElements[partition]);
420  batchElements[partition].clear();
421  ++validationSetStart[partition];
422  }
423  }
424  swap(set, newSet);
425  //now we only need to create the subset itself
426  return CVFolds<LabeledData<I,L> >(set,partitionStart);
427 }
428 
429 
430 
431 //! \brief Create a partition for cross validation from indices for both ordering and partitioning.
432 //!
433 //! Create a partition from indices. There is one index vector assigning an order
434 //! to the samples, and another one assigning each sample to a validation partition.
435 //! That is, given a dataset set, and at the i-th processing step, this function puts
436 //! the order_indices[i]-th sample into the partition_indices[i]-th partition. The
437 //! order_indices part of the above procedure matters if both an inner and
438 //! outer partition are to be recreated: for the inner partition to be recreated, too,
439 //! the outer partition must be recreated in the same order, not just partitioned into
440 //! the same splits.
441 //!
442 //! \param set partitions will be subsets of this set
443 //! \param numberOfPartitions number of partitions to create
444 //! \param indices stores location index in the first and partition index in the second vector
445 //! \param batchSize maximum batch size
446 template<class I,class L>
448  LabeledData<I,L> &set,
449  std::size_t numberOfPartitions,
450  RecreationIndices indices,
451  std::size_t batchSize=Data<I>::DefaultBatchSize
452 ) {
453  std::size_t numInputs = set.numberOfElements();
454  SIZE_CHECK(indices.first.size() == numInputs);
455  SIZE_CHECK(indices.second.size() == numInputs);
456  SIZE_CHECK(numberOfPartitions == *std::max_element(indices.second.begin(),indices.second.end())+1);
457 
458  //calculate the size of validation partitions
459  std::vector<std::size_t> validationSize(numberOfPartitions,0);
460  for (std::size_t input = 0; input != numInputs; input++) {
461  validationSize[indices.second[input]]++;
462  }
463 
464  //calculate the size of batches for every validation part and their total number
465  std::vector<std::size_t> partitionStart;
466  std::vector<std::size_t> batchSizes;
467  std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
468 
469  //construct a new set with the correct batch format from the old set
470  LabeledData<I,L> newSet(numBatches);
471  DataView<LabeledData<I,L> > setView(set); //fast access to single elements of the original set
472  std::vector<std::size_t> validationSetStart = partitionStart; //current index for the batch of every partition
473  std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
474  for (std::size_t input = 0; input != numInputs; input++) {
475  std::size_t partition = indices.second[input]; //the second vector's contents indicate the partition to assign each sample to.
476  batchElements[partition].push_back( indices.first[input] ); //the first vector's contents indicate from what original position to get the next sample.
477 
478  //if all elements for the current batch are found, create it
479  std::size_t batchNumber = validationSetStart[partition];
480  if (batchElements[partition].size() == batchSizes[batchNumber]) {
481  newSet.batch(validationSetStart[partition]) = subBatch(setView,batchElements[partition]);
482  batchElements[partition].clear();
483  ++validationSetStart[partition];
484  }
485  }
486  swap(set, newSet);
487  //now we only need to create the subset itself
488  return CVFolds<LabeledData<I,L> >(set,partitionStart);
489 }
490 
491 
492 // much more to come...
493 
494 /** @}*/
495 }
496 #include "Impl/CVDatasetTools.inl"
497 #endif