WeightedDataset.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Weighted data sets for (un-)supervised learning.
6  *
7  *
8  * \par
9  * This file provides containers for data used by the models, loss
10  * functions, and learning algorithms (trainers). The reason for
11  * dedicated containers of this type is that data often need to be
12  * split into subsets, such as training and test data, or folds in
13  * cross-validation. The containers in this file provide memory
14  * efficient mechanisms for managing and providing such subsets.
15  * The speciality of these containers is that they are weighted.
16  *
17  *
18  *
19  * \author O. Krause
20  * \date 2014
21  *
22  *
23  * \par Copyright 1995-2015 Shark Development Team
24  *
25  * <BR><HR>
26  * This file is part of Shark.
27  * <http://image.diku.dk/shark/>
28  *
29  * Shark is free software: you can redistribute it and/or modify
30  * it under the terms of the GNU Lesser General Public License as published
31  * by the Free Software Foundation, either version 3 of the License, or
32  * (at your option) any later version.
33  *
34  * Shark is distributed in the hope that it will be useful,
35  * but WITHOUT ANY WARRANTY; without even the implied warranty of
36  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
37  * GNU Lesser General Public License for more details.
38  *
39  * You should have received a copy of the GNU Lesser General Public License
40  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
41  *
42  */
43 //===========================================================================
44 
45 #ifndef SHARK_DATA_WEIGHTED_DATASET_H
46 #define SHARK_DATA_WEIGHTED_DATASET_H
47 
48 #include <shark/Data/Dataset.h>
49 namespace shark {
50 
51 namespace detail{
52 template <class DataContainerT>
53 class BaseWeightedDataset : public ISerializable
54 {
55 private:
56  typedef BaseWeightedDataset<DataContainerT> self_type;
57 public:
58  typedef typename DataContainerT::element_type DataType;
59  typedef double WeightType;
60  typedef DataContainerT DataContainer;
61  typedef Data<WeightType> WeightContainer;
62  typedef typename DataContainer::IndexSet IndexSet;
63 
64  // TYPEDEFS fOR PAIRS
65  typedef WeightedDataPair<
66  DataType,
67  WeightType
68  > element_type;
69 
70  typedef typename Batch<element_type>::type batch_type;
71 
72  // TYPEDEFS FOR RANGES
73  typedef typename PairRangeType<
74  element_type,
75  typename DataContainer::element_range,
77  >::type element_range;
78  typedef typename PairRangeType<
79  element_type,
80  typename DataContainer::const_element_range,
82  >::type const_element_range;
83  typedef typename PairRangeType<
84  batch_type,
85  typename DataContainer::batch_range,
87  >::type batch_range;
88  typedef typename PairRangeType<
89  batch_type,
90  typename DataContainer::const_batch_range,
92  >::type const_batch_range;
93 
94  // TYPEDEFS FOR REFERENCES
95  typedef typename boost::range_reference<batch_range>::type batch_reference;
96  typedef typename boost::range_reference<const_batch_range>::type const_batch_reference;
97  typedef typename boost::range_reference<element_range>::type element_reference;
98  typedef typename boost::range_reference<const_element_range>::type const_element_reference;
99 
100  ///\brief Returns the range of elements.
101  ///
102  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
103  ///element access via begin()/end() in which case data.elements() provides the correct interface
104  const_element_range elements()const{
105  return zipPairRange<element_type>(m_data.elements(),m_weights.elements());
106  }
107  ///\brief Returns therange of elements.
108  ///
109  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
110  ///element access via begin()/end() in which case data.elements() provides the correct interface
111  element_range elements(){
112  return zipPairRange<element_type>(m_data.elements(),m_weights.elements());
113  }
114 
115  ///\brief Returns the range of batches.
116  ///
117  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
118  ///element access via begin()/end() in which case data.elements() provides the correct interface
119  const_batch_range batches()const{
120  return zipPairRange<batch_type>(m_data.batches(),m_weights.batches());
121  }
122  ///\brief Returns the range of batches.
123  ///
124  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
125  ///element access via begin()/end() in which case data.elements() provides the correct interface
126  batch_range batches(){
127  return zipPairRange<batch_type>(m_data.batches(),m_weights.batches());
128  }
129 
130  ///\brief Returns the number of batches of the set.
131  std::size_t numberOfBatches() const{
132  return m_data.numberOfBatches();
133  }
134  ///\brief Returns the total number of elements.
135  std::size_t numberOfElements() const{
136  return m_data.numberOfElements();
137  }
138 
139  ///\brief Check whether the set is empty.
140  bool empty() const{
141  return m_data.empty();
142  }
143 
144  ///\brief Access to the stored data points as a separate container.
145  DataContainer const& data() const{
146  return m_data;
147  }
148  ///\brief Access to the stored data points as a separate container.
149  DataContainer& data(){
150  return m_data;
151  }
152 
153  ///\brief Access to weights as a separate container.
154  WeightContainer const& weights() const{
155  return m_weights;
156  }
157  ///\brief Access to weights as a separate container.
158  WeightContainer& weights(){
159  return m_weights;
160  }
161 
162  // CONSTRUCTORS
163 
164  ///\brief Constructs an Empty data set.
165  BaseWeightedDataset()
166  {}
167 
168  ///\brief Create an empty set with just the correct number of batches.
169  ///
170  /// The user must initialize the dataset after that by himself.
171  BaseWeightedDataset(std::size_t numBatches)
172  : m_data(numBatches),m_weights(numBatches)
173  {}
174 
175  /// \brief Construtor using a single element as blueprint to create a dataset with a specified number of elements.
176  ///
177  /// Optionally the desired batch Size can be set
178  ///
179  ///@param size the new size of the container
180  ///@param element the blueprint element from which to create the Container
181  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
182  BaseWeightedDataset(std::size_t size, element_type const& element, std::size_t batchSize)
183  : m_data(size,element.data,batchSize)
184  , m_weights(size,element.weight,batchSize)
185  {}
186 
187  ///\brief Construction from data and a dataset rpresnting the weights
188  ///
189  /// Beware that when calling this constructor the organization of batches must be equal in both
190  /// containers. This Constructor will not reorganize the data!
191  BaseWeightedDataset(DataContainer const& data, Data<WeightType> const& weights)
192  : m_data(data), m_weights(weights)
193  {
194  SHARK_CHECK(data.numberOfElements() == weights.numberOfElements(), "[ BaseWeightedDataset::WeightedUnlabeledData] number of data and number of weights must agree");
195 #ifndef DNDEBUG
196  for(std::size_t i = 0; i != data.numberOfBatches(); ++i){
197  SIZE_CHECK(shark::size(data.batch(i))==shark::size(weights.batch(i)));
198  }
199 #endif
200  }
201 
202  ///\brief Construction from data. All points get the same weight assigned
203  BaseWeightedDataset(DataContainer const& data, double weight)
204  : m_data(data), m_weights(data.numberOfBatches())
205  {
206  for(std::size_t i = 0; i != numberOfBatches(); ++i){
207  std::size_t batchSize = boost::size(m_data.batch(i));
208  m_weights.batch(i) = Batch<WeightType>::type(batchSize,weight);
209  }
210  }
211 
212 
213  // ELEMENT ACCESS
214  element_reference element(std::size_t i){
215  return element_reference(m_data.element(i),m_weights.element(i));
216  }
217  const_element_reference element(std::size_t i) const{
218  return const_element_reference(m_data.element(i),m_weights.element(i));
219  }
220 
221  // BATCH ACCESS
222  batch_reference batch(std::size_t i){
223  return batch_reference(m_data.batch(i),m_weights.batch(i));
224  }
225  const_batch_reference batch(std::size_t i) const{
226  return const_batch_reference(m_data.batch(i),m_weights.batch(i));
227  }
228 
229  // MISC
230 
231  /// from ISerializable
232  void read(InArchive& archive){
233  archive & m_data;
234  archive & m_weights;
235  }
236 
237  /// from ISerializable
238  void write(OutArchive& archive) const{
239  archive & m_data;
240  archive & m_weights;
241  }
242 
243  ///\brief This method makes the vector independent of all siblings and parents.
244  virtual void makeIndependent(){
245  m_weights.makeIndependent();
246  m_data.makeIndependent();
247  }
248 
249  ///\brief shuffles all elements in the entire dataset (that is, also across the batches)
250  virtual void shuffle(){
251  DiscreteUniform<Rng::rng_type> uni(Rng::globalRng);
252  shark::shuffle(this->elements().begin(),this->elements().end(), uni);
253  }
254 
255  void splitBatch(std::size_t batch, std::size_t elementIndex){
256  m_data.splitBatch(batch,elementIndex);
257  m_weights.splitBatch(batch,elementIndex);
258  }
259 
260  /// \brief Appends the contents of another data object to the end
261  ///
262  /// The batches are not copied but now referenced from both datasets. Thus changing the appended
263  /// dataset might change this one as well.
264  void append(self_type const& other){
265  m_data.append(other.m_data);
266  m_weights.append(other.m_weights);
267  }
268 
269 
270  ///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
271  ///
272  ///After the operation the container will contain batchSizes.size() batches with the i-th batch having size batchSize[i].
273  ///However the sum of all batch sizes must be equal to the current number of elements
274  template<class Range>
275  void repartition(Range const& batchSizes){
276  m_data.repartition(batchSizes);
277  m_weights.repartition(batchSizes);
278  }
279 
280  /// \brief Creates a vector with the batch sizes of every batch.
281  ///
282  /// This method can be used together with repartition to ensure
283  /// that two datasets have the same batch structure.
284  std::vector<std::size_t> getPartitioning()const{
285  return m_data.getPartitioning();
286  }
287 
288  friend void swap( self_type& a, self_type& b){
289  swap(a.m_data,b.m_data);
290  swap(a.m_weights,b.m_weights);
291  }
292 
293 
294  // SUBSETS
295 
296  ///\brief Fill in the subset defined by the list of indices.
297  void indexedSubset(IndexSet const& indices, self_type& subset) const{
298  m_data.indexedSubset(indices,subset.m_data);
299  m_weights.indexedSubset(indices,subset.m_weights);
300  }
301 
302  ///\brief Fill in the subset defined by the list of indices as well as its complement.
303  void indexedSubset(IndexSet const& indices, self_type& subset, self_type& complement)const{
304  IndexSet comp;
305  detail::complement(indices,m_data.numberOfBatches(),comp);
306  m_data.indexedSubset(indices,subset.m_data);
307  m_weights.indexedSubset(indices,subset.m_weights);
308  m_data.indexedSubset(comp,complement.m_data);
309  m_weights.indexedSubset(comp,complement.m_weights);
310  }
311 private:
312  DataContainer m_data; /// point data
313  WeightContainer m_weights; /// weight data
314 };
315 
316 }
317 
318 ///
319 /// \brief Weighted data set for unsupervised learning
320 ///
321 /// The WeightedUnlabeledData class extends UnlabeledData for the
322 /// representation of data. In addition it holds and provides access to the corresponding weights.
323 ///
324 /// WeightedUnlabeledData tries to mimic the underlying data as pairs of data points and weights.
325  /// This means that when accessing a batch by calling batch(i) or using one of the iterators,
326  /// one accesses the input batch via batch(i).data and the weights via batch(i).weight.
327  ///
328  /// This also holds true for single element access using element(i). Be aware that direct access to an element is
329  /// a linear time operation, so it is not advisable to iterate over the elements this way; iterate over the batches instead.
330 template <class DataT>
331 class WeightedUnlabeledData : public detail::BaseWeightedDataset <UnlabeledData<DataT> >
332 {
333 private:
334  typedef WeightedUnlabeledData<DataT> self_type;
335  typedef detail::BaseWeightedDataset <UnlabeledData<DataT> > base_type;
336 public:
337  using base_type::data;
338  using base_type::weights;
339  typedef typename base_type::DataType DataType;
340  typedef typename base_type::WeightType WeightType;
341  typedef typename base_type::element_type element_type;
342  typedef DataT InputType;
343 
344  BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = UnlabeledData<DataT>::DefaultBatchSize);
345 
346  // CONSTRUCTORS
347 
348  ///\brief Empty data set.
350  {}
351 
352  ///\brief Create an empty set with just the correct number of batches.
353  ///
354  /// The user must initialize the dataset after that by himself.
355  WeightedUnlabeledData(std::size_t numBatches)
356  : base_type(numBatches)
357  {}
358 
359  /// \brief Constructor using a single element as blueprint to create a dataset with a specified number of elements.
360  ///
361  /// Optionally the desired batch Size can be set
362  ///
363  ///@param size the new size of the container
364  ///@param element the blueprint element from which to create the Container
365  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
366  WeightedUnlabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
367  : base_type(size,element,batchSize){}
368 
369  ///\brief Construction from data.
370  ///
371  /// Beware that when calling this constructor the organization of batches must be equal in both
372  /// containers. This Constructor will not reorganize the data!
374  : base_type(data,weights)
375  {}
376 
377  ///\brief Construction from data and a constant weight for all elements
378  WeightedUnlabeledData(UnlabeledData<DataType> const& data, double weight)
379  : base_type(data,weight)
380  {}
381 
382  //we additionally add the two below for compatibility with UnlabeledData
383 
384  ///\brief Access to the inputs as a separate container.
385  UnlabeledData<DataT> const& inputs() const{
386  return data();
387  }
388  ///\brief Access to the inputs as a separate container.
390  return data();
391  }
392 
393  ///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
394  ///
395  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
396  ///this to work.
397  self_type splice(std::size_t batch){
398  return self_type(data().splice(batch),weights().splice(batch));
399  }
400 
402  swap(static_cast<base_type&>(a),static_cast<base_type&>(b));
403  }
404 };
405 
406 ///brief Outstream of elements for weighted data.
407 template<class T>
408 std::ostream &operator << (std::ostream &stream, const WeightedUnlabeledData<T>& d) {
409  typedef typename WeightedUnlabeledData<T>::const_element_reference reference;
410  typename WeightedUnlabeledData<T>::const_element_range elements = d.elements();
411  BOOST_FOREACH(reference elem,elements)
412  stream << elem.weight << " [" << elem.data<<"]"<< "\n";
413  return stream;
414 }
415 
416 /// \brief Creates a weighted unlabeled data object from two ranges, representing data and weights
417 template<class DataRange, class WeightRange>
418 typename boost::disable_if<
419  boost::is_arithmetic<WeightRange>,
421  typename boost::range_value<DataRange>::type
422  >
423 >::type createUnlabeledDataFromRange(DataRange const& data, WeightRange const& weights, std::size_t batchSize = 0){
424  SHARK_CHECK(boost::size(data) == boost::size(weights),
425  "[createDataFromRange] number of data points and number of weights must agree");
426  typedef typename boost::range_value<DataRange>::type Data;
427 
428  if (batchSize == 0)
430 
432  shark::createUnlabeledDataFromRange(data,batchSize),
433  createDataFromRange(weights,batchSize)
434  );
435 }
436 
437 
438 ///
439 /// \brief Weighted data set for supervised learning
440 ///
441 /// The WeightedLabeledData class extends LabeledData for the
442 /// representation of data. In addition it holds and provides access to the corresponding weights.
443 ///
444 /// WeightedLabeledData tries to mimic the underlying data as pairs of data tuples(input,label) and weights.
445  /// This means that when accessing a batch by calling batch(i) or using one of the iterators,
446  /// one accesses the data batch via batch(i).data and the weights via batch(i).weight. To access the points and labels,
447  /// use batch(i).data.input and batch(i).data.label.
448  ///
449  /// This also holds true for single element access using element(i). Be aware that direct access to an element is
450  /// a linear time operation, so it is not advisable to iterate over the elements this way; iterate over the batches instead.
451 ///
452  /// It is possible to gain several views on the set. One can either get access to inputs, labels and weights separately
453 /// or gain access to the unweighted dataset of inputs and labels. Additionally the sets support on-the-fly creation
454 /// of the (inputs,weights) subset for unsupervised weighted learning
455 template <class InputT, class LabelT>
456 class WeightedLabeledData : public detail::BaseWeightedDataset <LabeledData<InputT,LabelT> >
457 {
458 private:
459  typedef WeightedLabeledData<InputT,LabelT> self_type;
460  typedef detail::BaseWeightedDataset <LabeledData<InputT,LabelT> > base_type;
461 public:
462  typedef typename base_type::DataType DataType;
463  typedef typename base_type::WeightType WeightType;
464  typedef InputT InputType;
465  typedef LabelT LabelType;
466  typedef typename base_type::element_type element_type;
467 
468  using base_type::data;
469  using base_type::weights;
470 
471  BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = (LabeledData<InputT,LabelT>::DefaultBatchSize));
472 
473  // CONSTRUCTORS
474 
475  ///\brief Empty data set.
477  {}
478 
479  ///\brief Create an empty set with just the correct number of batches.
480  ///
481  /// The user must initialize the dataset after that by himself.
482  WeightedLabeledData(std::size_t numBatches)
483  : base_type(numBatches)
484  {}
485 
486  /// \brief Constructor using a single element as blueprint to create a dataset with a specified number of elements.
487  ///
488  /// Optionally the desired batch Size can be set
489  ///
490  ///@param size the new size of the container
491  ///@param element the blueprint element from which to create the Container
492  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
493  WeightedLabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
494  : base_type(size,element,batchSize){}
495 
496  ///\brief Construction from data.
497  ///
498  /// Beware that when calling this constructor the organization of batches must be equal in both
499  /// containers. This Constructor will not reorganize the data!
501  : base_type(data,weights)
502  {}
503 
504  ///\brief Construction from data and a constant weight for all elements
506  : base_type(data,weight)
507  {}
508 
509  ///\brief Access to the inputs as a separate container.
511  return data().inputs();
512  }
513  ///\brief Access to the inputs as a separate container.
515  return data().inputs();
516  }
517 
518  ///\brief Access to the labels as a separate container.
519  Data<LabelType> const& labels() const{
520  return data().labels();
521  }
522  ///\brief Access to the labels as a separate container.
524  return data().labels();
525  }
526 
527  /// \brief Constructs an WeightedUnlabeledData object for the inputs.
529  return WeightedUnlabeledData<InputType>(data().inputs(),weights());
530  }
531 
532  ///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
533  ///
534  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
535  ///this to work.
536  self_type splice(std::size_t batch){
537  return self_type(data().splice(batch),weights().splice(batch));
538  }
539 
540  friend void swap(self_type& a, self_type& b){
541  swap(static_cast<base_type&>(a),static_cast<base_type&>(b));
542  }
543 };
544 
545 ///brief Outstream of elements for weighted labeled data.
546 template<class T, class U>
547 std::ostream &operator << (std::ostream &stream, const WeightedLabeledData<T, U>& d) {
548  typedef typename WeightedLabeledData<T, U>::const_element_reference reference;
549  typename WeightedLabeledData<T, U>::const_element_range elements = d.elements();
550  BOOST_FOREACH(reference elem,elements)
551  stream << elem.weight <<" ("<< elem.data.label << " [" << elem.data.input<<"] )"<< "\n";
552  return stream;
553 }
554 
555 //Stuff for Dimensionality and querying of basic information
556 
557 template<class InputType>
559  double weightSum = 0;
560  for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){
561  weightSum += sum(dataset.batch(i).weight);
562  }
563  return weightSum;
564 }
565 template<class InputType, class LabelType>
567  double weightSum = 0;
568  for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){
569  weightSum += sum(dataset.batch(i).weight);
570  }
571  return weightSum;
572 }
573 
574 inline std::size_t numberOfClasses(WeightedUnlabeledData<unsigned int> const& labels){
575  return numberOfClasses(labels.data());
576 }
577 
578 ///\brief Returns the number of members of each class in the dataset.
579 inline std::vector<std::size_t> classSizes(WeightedUnlabeledData<unsigned int> const& labels){
580  return classSizes(labels.data());
581 }
582 
583 ///\brief Return the dimensionality of the points of a weighted dataset
584 template <class InputType>
586  return dataDimension(dataset.data());
587 }
588 
589 ///\brief Return the input dimensionality of a weighted labeled dataset.
590 template <class InputType, class LabelType>
592  return dataDimension(dataset.inputs());
593 }
594 
595 ///\brief Return the label/output dimensionality of a labeled dataset.
596 template <class InputType, class LabelType>
598  return dataDimension(dataset.labels());
599 }
600 ///\brief Return the number of classes (highest label value +1) of a classification dataset with unsigned int label encoding
601 template <class InputType>
603  return numberOfClasses(dataset.labels());
604 }
605 
606 ///\brief Returns the number of members of each class in the dataset.
607 template<class InputType, class LabelType>
608 inline std::vector<std::size_t> classSizes(WeightedLabeledData<InputType, LabelType> const& dataset){
609  return classSizes(dataset.labels());
610 }
611 
612 //creation of weighted datasets
613 
614 /// \brief creates a weighted unweighted data object from two ranges, representing data and weights
615 template<class InputRange,class LabelRange, class WeightRange>
616 typename boost::disable_if<
617  boost::is_arithmetic<WeightRange>,
619  typename boost::range_value<InputRange>::type,
620  typename boost::range_value<LabelRange>::type
621  >
622 >::type createLabeledDataFromRange(InputRange const& inputs, LabelRange const& labels, WeightRange const& weights, std::size_t batchSize = 0){
623  SHARK_CHECK(boost::size(inputs) == boost::size(labels),
624  "[createDataFromRange] number of data points and number of weights must agree");
625  SHARK_CHECK(boost::size(inputs) == boost::size(weights),
626  "[createDataFromRange] number of data points and number of weights must agree");
627  typedef typename boost::range_value<InputRange>::type InputType;
628  typedef typename boost::range_value<LabelRange>::type LabelType;
629 
630  if (batchSize == 0)
632 
634  createLabeledDataFromRange(inputs,labels,batchSize),
635  createDataFromRange(weights,batchSize)
636  );
637 }
638 
639 /// \brief Creates a bootstrap partition of a labeled dataset and returns it using weighting.
640 ///
641 /// Bootstrapping resamples the dataset by drawing a set of points with
642 /// replacement. Thus the sampled set will contain some points multiple times
643 /// and some points not at all. Bootstrapping is usefull to obtain unbiased
644 /// measurements of the mean and variance of an estimator.
645 ///
646 /// Optionally the size of the bootstrap (that is, the number of sampled points)
647 /// can be set. By default it is 0, which indicates that it is the same size as the original dataset.
648 template<class InputType, class LabelType>
650  LabeledData<InputType,LabelType> const& dataset,
651  std::size_t bootStrapSize = 0
652 ){
653  if(bootStrapSize == 0)
654  bootStrapSize = dataset.numberOfElements();
655 
656  WeightedLabeledData<InputType,LabelType> bootstrapSet(dataset,0.0);
657 
658  for(std::size_t i = 0; i != bootStrapSize; ++i){
659  std::size_t index = Rng::discrete(0,bootStrapSize-1);
660  bootstrapSet.element(index).weight += 1.0;
661  }
662  return bootstrapSet;
663 }
664 
665 /// \brief Creates a bootstrap partition of an unlabeled dataset and returns it using weighting.
666 ///
667 /// Bootstrapping resamples the dataset by drawing a set of points with
668 /// replacement. Thus the sampled set will contain some points multiple times
669 /// and some points not at all. Bootstrapping is usefull to obtain unbiased
670 /// measurements of the mean and variance of an estimator.
671 ///
672 /// Optionally the size of the bootstrap (that is, the number of sampled points)
673 /// can be set. By default it is 0, which indicates that it is the same size as the original dataset.
674 template<class InputType>
676  UnlabeledData<InputType> const& dataset,
677  std::size_t bootStrapSize = 0
678 ){
679  if(bootStrapSize == 0)
680  bootStrapSize = dataset.numberOfElements();
681 
682  WeightedUnlabeledData<InputType> bootstrapSet(dataset,0.0);
683 
684  for(std::size_t i = 0; i != bootStrapSize; ++i){
685  std::size_t index = Rng::discrete(0,bootStrapSize-1);
686  bootstrapSet.element(index).weight += 1.0;
687  }
688  return bootstrapSet;
689 }
690 
691 /** @*/
692 }
693 
694 #endif