// Dataset.h
// (Doxygen source-listing artifact; original page text: "Go to the documentation of this file.")
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Data for (un-)supervised learning.
6  *
7  *
8  * \par
9  * This file provides containers for data used by the models, loss
10  * functions, and learning algorithms (trainers). The reason for
11  * dedicated containers of this type is that data often need to be
12  * split into subsets, such as training and test data, or folds in
13  * cross-validation. The containers in this file provide memory
14  * efficient mechanisms for managing and providing such subsets.
15  *
16  *
17  *
18  *
19  * \author O. Krause, T. Glasmachers
20  * \date 2010-2014
21  *
22  *
23  * \par Copyright 1995-2015 Shark Development Team
24  *
25  * <BR><HR>
26  * This file is part of Shark.
27  * <http://image.diku.dk/shark/>
28  *
29  * Shark is free software: you can redistribute it and/or modify
30  * it under the terms of the GNU Lesser General Public License as published
31  * by the Free Software Foundation, either version 3 of the License, or
32  * (at your option) any later version.
33  *
34  * Shark is distributed in the hope that it will be useful,
35  * but WITHOUT ANY WARRANTY; without even the implied warranty of
36  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
37  * GNU Lesser General Public License for more details.
38  *
39  * You should have received a copy of the GNU Lesser General Public License
40  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
41  *
42  */
43 //===========================================================================
44 
45 #ifndef SHARK_DATA_DATASET_H
46 #define SHARK_DATA_DATASET_H
47 
48 #include <boost/foreach.hpp>
49 #include <boost/range/iterator_range.hpp>
50 #include <boost/range/algorithm/sort.hpp>
51 
52 #include <shark/Core/Exception.h>
53 #include <shark/Core/OpenMP.h>
55 #include <shark/Rng/GlobalRng.h>
56 #include "Impl/Dataset.inl"
57 
58 namespace shark {
59 
60 
61 ///
62 /// \brief Data container.
63 ///
64 /// The Data class is Shark's container for machine learning data.
65 /// This container (and its sub-classes) is used for input data,
66 /// labels, and model outputs.
67 ///
68 /// \par
69 /// The Data container organizes the data it holds in batches.
70 /// This means, that it tries to find a good data representation for a whole
71 /// set of, for example 100 data points, at the same time. If the type of data it stores
72 /// is for example RealVector, the batches of this type are RealMatrices. This is good because most often
73 /// operations on the whole matrix are faster than operations on the separate vectors.
74 /// Nearly all operations of the set have to be interpreted in terms of the batch. Therefore the iterator interface will
75 /// give access to the batches but not to single elements. For this separate element_iterators and const_element_iterators
76 /// can be used.
77 ///\par
78 /// There are a lot of these typedefs. The typical typedefs for containers like batch_type or iterator are chosen
79 /// as types for the batch interface. For accessing single elements, a different set of typedefs is in place. Thus instead of iterator
80 /// you must write element_iterator and instead of batch_type write element_type. Usually you should not use element_type except when
81 /// you want to actually copy the data. Instead use element_reference or const_element_reference. Note that these are proxy objects and not
82 /// actual references to element_type!
83 /// A short example for these typedefs:
84 ///\code
85 ///typedef Data<RealVector> Set;
86 /// Set data;
87 /// for(Set::element_iterator pos=data.elemBegin();pos!= data.elemEnd();++pos){
88 /// std::cout<<*pos<<" ";
89 /// Set::element_reference ref=*pos;
90 /// ref*=2;
91 /// std::cout<<*pos<<std::endl;
92 ///}
93 ///\endcode
94 ///When you write C++11 code, this is of course much simpler:
95 ///\code
96 /// Data<RealVector> data;
97 /// for(auto pos=data.elemBegin();pos!= data.elemEnd();++pos){
98 /// std::cout<<*pos<<" ";
99 /// auto ref=*pos;
100 /// ref*=2;
101 /// std::cout<<*pos<<std::endl;
102 ///}
103 ///\endcode
104 /// \par
105 /// Element wise accessing of elements is usually slower than accessing the batches. If possible, use direct batch access, or
106 /// at least use the iterator interface to iterate over all elements. Random access to single elements is linear time, so use it wisely.
107 /// Of course, when you want to use batches, you need to know the actual batch type. This depends on the actual type of the input.
108 /// here are the rules:
109 /// if the input is an arithmetic type like int or double, the result will be a vector of this
110 /// (i.e. double->RealVector or Int->IntVector).
111 /// For vectors the results are matrices as mentioned above. If the vector is sparse, so is the matrix.
112 /// And for everything else the batch type is just a std::vector of the type, so no optimization can be applied.
113 /// \par
114 /// When constructing the container the batchSize can be set. If it is not set by the user the default batchSize is chosen. A BatchSize of 0
115 /// corresponds to putting all data into a single batch. Beware that not only the data needs storage but also
116 /// the various models during computation. So the actual amount of space to compute a batch can greatly exceed the batch size.
117 ///
118 /// An additional feature of the Data class is that it can be used to create lazy subsets. So the batches of a dataset
119 /// can be shared between various instances of the data class without additional memory overhead.
120 ///
121 ///
122 ///\warning Be aware --especially for derived containers like LabeledData-- that the set does not enforce structural consistency.
123 /// When you change the structure of the data part for example by directly changing the size of the batches, the size of the labels is not
124 /// enforced to change accordingly. Also when creating subsets of a set changing the parent will change it's siblings and conversely. The programmer
125 /// needs to ensure structural integrity!
126 /// For example this is dangerous:
127 /// \code
128 /// void function(Data<unsigned int>& data){
129 /// Data<unsigned int> newData(...);
130 /// data=newData;
131 /// }
132 /// \endcode
133 /// When data was originally a labeledData object, and newData has a different batch structure than data, this will lead to structural inconsistencies.
134 /// When function is rewritten such that newData has the same structure as data, this code is perfectly fine. The best way to get around this problem is
135 /// by rewriting the code as:
136 /// \code
137 /// Data<unsigned int> function(){
138 /// Data<unsigned int> newData(...);
139 /// return newData;
140 /// }
141 /// \endcode
142 ///\todo expand docu
143 template <class Type>
144 class Data : public ISerializable
145 {
146 protected:
147  typedef detail::SharedContainer<Type> Container;
148  typedef Data<Type> self_type;
149 
150  Container m_data; ///< data
151 public:
152  /// \brief Defines the default batch size of the Container.
153  ///
154  /// Zero means: unlimited
155  BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = 256);
156 
157  typedef typename Container::BatchType batch_type;
158  typedef batch_type& batch_reference;
159  typedef batch_type const& const_batch_reference;
160 
161  typedef Type element_type;
164 
165  typedef std::vector<std::size_t> IndexSet;
166 
167  template <class T> friend bool operator == (const Data<T>& op1, const Data<T>& op2);
168  template <class InputT, class LabelT> friend class LabeledData;
169 
170 
171  // RANGES
172  typedef boost::iterator_range<typename Container::element_iterator> element_range;
173  typedef boost::iterator_range<typename Container::const_element_iterator> const_element_range;
174  typedef boost::iterator_range<typename Container::iterator> batch_range;
175  typedef boost::iterator_range<typename Container::const_iterator> const_batch_range;
176 
177 
178  ///\brief Returns the range of elements.
179  ///
180  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
181  ///element access via begin()/end() in which case data.elements() provides the correct interface
182  const_element_range elements()const{
183  return const_element_range(m_data.elemBegin(),m_data.elemEnd());
184  }
185  ///\brief Returns therange of elements.
186  ///
187  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
188  ///element access via begin()/end() in which case data.elements() provides the correct interface
189  element_range elements(){
190  return element_range(m_data.elemBegin(),m_data.elemEnd());
191  }
192 
193  ///\brief Returns the range of batches.
194  ///
195  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
196  ///element access via begin()/end() in which case data.elements() provides the correct interface
197  const_batch_range batches()const{
198  return const_batch_range(m_data.begin(),m_data.end());
199  }
200  ///\brief Returns the range of batches.
201  ///
202  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
203  ///element access via begin()/end() in which case data.elements() provides the correct interface
204  batch_range batches(){
205  return batch_range(m_data.begin(),m_data.end());
206  }
207 
208  ///\brief Returns the number of batches of the set.
209  std::size_t numberOfBatches() const{
210  return m_data.size();
211  }
212  ///\brief Returns the total number of elements.
213  std::size_t numberOfElements() const{
214  return m_data.numberOfElements();
215  }
216 
217  ///\brief Check whether the set is empty.
218  bool empty() const{
219  return m_data.empty();
220  }
221 
222  // ELEMENT ACCESS
223  element_reference element(std::size_t i){
224  return *(m_data.elemBegin()+i);
225  }
226  const_element_reference element(std::size_t i) const{
227  return *(m_data.elemBegin()+i);
228  }
229 
230  // BATCH ACCESS
231  batch_reference batch(std::size_t i){
232  return *(m_data.begin()+i);
233  }
234  const_batch_reference batch(std::size_t i) const{
235  return *(m_data.begin()+i);
236  }
237 
238  // CONSTRUCTORS
239 
240  ///\brief Constructor which constructs an empty set
241  Data(){ }
242 
243  ///\brief Construct a dataset with empty batches.
244  explicit Data(std::size_t numBatches) : m_data( numBatches )
245  { }
246 
247  ///\brief Construct a dataset with different batch sizes as a copy of another dataset
248  explicit Data(Data const& container, std::vector<std::size_t> batchSizes)
249  : m_data( container.m_data, batchSizes, true )
250  { }
251 
252  ///\brief Construction with size and a single element
253  ///
254  /// Optionally the desired batch Size can be set
255  ///
256  ///@param size the new size of the container
257  ///@param element the blueprint element from which to create the Container
258  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
259  explicit Data(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
260  : m_data(size,element,batchSize)
261  { }
262 
263  // MISC
264 
265  void read(InArchive& archive){
266  archive >> m_data;
267  }
268 
269  void write(OutArchive& archive) const{
270  archive << m_data;
271  }
272  ///\brief This method makes the vector independent of all siblings and parents.
273  virtual void makeIndependent(){
274  m_data.makeIndependent();
275  }
276 
277 
278  // METHODS TO ALTER BATCH STRUCTURE
279 
280  void splitBatch(std::size_t batch, std::size_t elementIndex){
281  m_data.splitBatch(m_data.begin()+batch,elementIndex);
282  }
283 
284  ///\brief Splits the container into two independent parts. The front part remains in the container, the back part is returned.
285  ///
286  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
287  ///this to work.
288  self_type splice(std::size_t batch){
289  self_type right;
290  right.m_data=m_data.splice(m_data.begin()+batch);
291  return right;
292  }
293 
294  /// \brief Appends the contents of another data object to the end
295  ///
296  /// The batches are not copied but now referenced from both datasets. Thus changing the appended
297  /// dataset might change this one as well.
298  void append(self_type const& other){
299  m_data.append(other.m_data);
300  }
301 
302  void push_back(const_batch_reference batch){
303  m_data.push_back(batch);
304  }
305 
306  ///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
307  ///
308  ///After the operation the container will contain batchSizes.size() batchs with the i-th batch having size batchSize[i].
309  ///However the sum of all batch sizes must be equal to the current number of elements
310  template<class Range>
311  void repartition(Range const& batchSizes){
312  m_data.repartition(batchSizes);
313  }
314 
315  /// \brief Creates a vector with the batch sizes of every batch.
316  ///
317  /// This method can be used together with repartition to ensure
318  /// that two datasets have the same batch structure.
319  std::vector<std::size_t> getPartitioning()const{
320  return m_data.getPartitioning();
321  }
322 
323  // SUBSETS
324  ///\brief Fill in the subset defined by the list of indices.
325  void indexedSubset(IndexSet const& indices, self_type& subset) const{
326  subset.m_data=Container(m_data,indices);
327  }
328 
329  ///\brief Fill in the subset defined by the list of indices as well as its complement.
330  void indexedSubset(IndexSet const& indices, self_type& subset, self_type& complement) const{
331  IndexSet comp;
332  detail::complement(indices,m_data.size(),comp);
333  subset.m_data=Container(m_data,indices);
334  complement.m_data=Container(m_data,comp);
335  }
336 
337  friend void swap(Data& a, Data& b){
338  swap(a.m_data,b.m_data);
339  }
340 };
341 
342 /**
343  * \ingroup shark_globals
344  * @{
345  */
346 
347 /// Outstream of elements.
348 template<class T>
349 std::ostream &operator << (std::ostream &stream, const Data<T>& d) {
350  typedef typename Data<T>::const_element_reference reference;
351  typename Data<T>::const_element_range elements = d.elements();
352  BOOST_FOREACH(reference elem,elements)
353  stream << elem << "\n";
354  return stream;
355 }
356 /** @} */
357 
358 /// \brief Data set for unsupervised learning.
359 ///
360 /// The UnlabeledData class is basically a standard Data container
361 /// with the special interpretation of its data point being
362 /// "inputs" to a learning algorithm.
363 template <class InputT>
364 class UnlabeledData : public Data<InputT>
365 {
366 public:
367  typedef InputT element_type;
370  typedef element_type InputType;
371  typedef detail::SharedContainer<InputT> InputContainer;
372 
373 protected:
374  using base_type::m_data;
375 public:
376 
377  ///\brief Constructor.
379  { }
380 
381  ///\brief Construction from data.
383  : base_type(points)
384  { }
385 
386  ///\brief Construction with size and a single element
387  ///
388  /// Optionally the desired batch Size can be set
389  ///
390  ///@param size the new size of the container
391  ///@param element the blueprint element from which to create the Container
392  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
393  UnlabeledData(std::size_t size, element_type const& element, std::size_t batchSize = base_type::DefaultBatchSize)
394  : base_type(size,element,batchSize)
395  { }
396 
397  ///\brief Create an empty set with just the correct number of batches.
398  ///
399  /// The user must initialize the dataset after that by himself.
400  UnlabeledData(std::size_t numBatches)
401  : base_type(numBatches)
402  { }
403 
404  ///\brief Construct a dataset with different batch sizes. it is a copy of the other dataset
405  UnlabeledData(UnlabeledData const& container, std::vector<std::size_t> batchSizes)
406  :base_type(container,batchSizes){}
407 
408  /// \brief we allow assignment from Data.
409  self_type operator=(Data<InputT> const& data){
410  static_cast<Data<InputT>& >(*this) = data;
411  return *this;
412  }
413 
414  ///\brief Access to the base_type class as "inputs".
415  ///
416  /// Added for consistency with the LabeledData::labels() method.
417  self_type& inputs(){
418  return *this;
419  }
420 
421  ///\brief Access to the base_type class as "inputs".
422  ///
423  /// Added for consistency with the LabeledData::labels() method.
424  self_type const& inputs() const{
425  return *this;
426  }
427 
428  ///\brief Splits the container in two independent parts. The left part remains in the container, the right is stored as return type
429  ///
430  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
431  ///this to work.
432  self_type splice(std::size_t batch){
433  self_type right;
434  right.m_data=m_data.splice(m_data.begin()+batch);
435  return right;
436  }
437 
438  ///\brief shuffles all elements in the entire dataset (that is, also across the batches)
439  virtual void shuffle(){
440  DiscreteUniform<Rng::rng_type> uni(Rng::globalRng);
441  shark::shuffle(this->elements().begin(),this->elements().end(), uni);
442  }
443 };
444 
445 ///
446 /// \brief Data set for supervised learning.
447 ///
448 /// The LabeledData class extends UnlabeledData for the
449 /// representation of inputs. In addition it holds and
450 /// provides access to the corresponding labels.
451 ///
452 /// LabeledData tries to mimic the underlying data as pairs of input and label data.
453 /// this means that when accessing a batch by calling batch(i) or choosing one of the iterators
454 /// one access the input batch by batch(i).input and the labels by batch(i).label
455 ///
456 ///this also holds true for single element access using operator(). Be aware, that direct access to an element is
457 ///a linear time operation. So it is not advisable to iterate over the elements, but instead iterate over the batches.
458 template <class InputT, class LabelT>
460 {
461 protected:
463 public:
464  typedef InputT InputType;
465  typedef LabelT LabelType;
469 
470  BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = InputContainer::DefaultBatchSize);
471 
472  // TYPEDEFS fOR PAIRS
473  typedef DataBatchPair<
474  typename Batch<InputType>::type,
475  typename Batch<LabelType>::type
476  > batch_type;
477 
478  typedef DataPair<
479  InputType,
480  LabelType
481  > element_type;
482 
483  // TYPEDEFS FOR RANGES
484  typedef typename PairRangeType<
485  element_type,
488  >::type element_range;
489  typedef typename PairRangeType<
490  element_type,
494  typedef typename PairRangeType<
495  batch_type,
498  >::type batch_range;
499  typedef typename PairRangeType<
500  batch_type,
504 
505  // TYPEDEFS FOR REFERENCES
506  typedef typename boost::range_reference<batch_range>::type batch_reference;
507  typedef typename boost::range_reference<const_batch_range>::type const_batch_reference;
508  typedef typename boost::range_reference<element_range>::type element_reference;
509  typedef typename boost::range_reference<const_element_range>::type const_element_reference;
510 
511  ///\brief Returns the range of elements.
512  ///
513  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
514  ///element access via begin()/end() in which case data.elements() provides the correct interface
515  const_element_range elements()const{
516  return zipPairRange<element_type>(m_data.elements(),m_label.elements());
517  }
518  ///\brief Returns therange of elements.
519  ///
520  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
521  ///element access via begin()/end() in which case data.elements() provides the correct interface
522  element_range elements(){
523  return zipPairRange<element_type>(m_data.elements(),m_label.elements());
524  }
525 
526  ///\brief Returns the range of batches.
527  ///
528  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
529  ///element access via begin()/end() in which case data.elements() provides the correct interface
530  const_batch_range batches()const{
531  return zipPairRange<batch_type>(m_data.batches(),m_label.batches());
532  }
533  ///\brief Returns the range of batches.
534  ///
535  ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
536  ///element access via begin()/end() in which case data.elements() provides the correct interface
537  batch_range batches(){
538  return zipPairRange<batch_type>(m_data.batches(),m_label.batches());
539  }
540 
541  ///\brief Returns the number of batches of the set.
542  std::size_t numberOfBatches() const{
543  return m_data.numberOfBatches();
544  }
545  ///\brief Returns the total number of elements.
546  std::size_t numberOfElements() const{
547  return m_data.numberOfElements();
548  }
549 
550  ///\brief Check whether the set is empty.
551  bool empty() const{
552  return m_data.empty();
553  }
554 
555  ///\brief Access to inputs as a separate container.
556  InputContainer const& inputs() const{
557  return m_data;
558  }
559  ///\brief Access to inputs as a separate container.
560  InputContainer& inputs(){
561  return m_data;
562  }
563 
564  ///\brief Access to labels as a separate container.
565  LabelContainer const& labels() const{
566  return m_label;
567  }
568  ///\brief Access to labels as a separate container.
569  LabelContainer& labels(){
570  return m_label;
571  }
572 
573  // CONSTRUCTORS
574 
575  ///\brief Empty data set.
577  {}
578 
579  ///\brief Create an empty set with just the correct number of batches.
580  ///
581  /// The user must initialize the dataset after that by himself.
582  LabeledData(std::size_t numBatches)
583  : m_data(numBatches),m_label(numBatches)
584  {}
585 
586  ///
587  /// Optionally the desired batch Size can be set
588  ///
589  ///@param size the new size of the container
590  ///@param element the blueprint element from which to create the Container
591  ///@param batchSize the size of the batches. if this is 0, the size is unlimited
592  LabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
593  : m_data(size,element.input,batchSize),
594  m_label(size,element.label,batchSize)
595  {}
596 
597  ///\brief Construction from data.
598  ///
599  /// Beware that when calling this constructor the organization of batches must be equal in both
600  /// containers. This Constructor will not split the data!
601  LabeledData(Data<InputType> const& inputs, Data<LabelType> const& labels)
602  : m_data(inputs), m_label(labels)
603  {
604  SHARK_CHECK(inputs.numberOfElements() == labels.numberOfElements(), "[LabeledData::LabeledData] number of inputs and number of labels must agree");
605 #ifndef DNDEBUG
606  for(std::size_t i = 0; i != inputs.numberOfBatches(); ++i){
607  SIZE_CHECK(shark::size(inputs.batch(i))==shark::size(labels.batch(i)));
608  }
609 #endif
610  }
611  // ELEMENT ACCESS
612  element_reference element(std::size_t i){
613  return element_reference(m_data.element(i),m_label.element(i));
614  }
615  const_element_reference element(std::size_t i) const{
616  return const_element_reference(m_data.element(i),m_label.element(i));
617  }
618 
619  // BATCH ACCESS
620  batch_reference batch(std::size_t i){
621  return batch_reference(m_data.batch(i),m_label.batch(i));
622  }
623  const_batch_reference batch(std::size_t i) const{
624  return const_batch_reference(m_data.batch(i),m_label.batch(i));
625  }
626 
627  // MISC
628 
629  /// from ISerializable
630  void read(InArchive& archive){
631  archive & m_data;
632  archive & m_label;
633  }
634 
635  /// from ISerializable
636  void write(OutArchive& archive) const{
637  archive & m_data;
638  archive & m_label;
639  }
640 
641  ///\brief This method makes the vector independent of all siblings and parents.
642  virtual void makeIndependent(){
643  m_label.makeIndependent();
644  m_data.makeIndependent();
645  }
646 
647  ///\brief shuffles all elements in the entire dataset (that is, also across the batches)
648  virtual void shuffle(){
649  DiscreteUniform<Rng::rng_type> uni(Rng::globalRng);
650  shark::shuffle(this->elements().begin(),this->elements().end(), uni);
651  }
652 
653  void splitBatch(std::size_t batch, std::size_t elementIndex){
654  m_data.splitBatch(batch,elementIndex);
655  m_label.splitBatch(batch,elementIndex);
656  }
657 
658  ///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
659  ///
660  ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
661  ///this to work.
662  self_type splice(std::size_t batch){
663  return self_type(m_data.splice(batch),m_label.splice(batch));
664  }
665 
666  /// \brief Appends the contents of another data object to the end
667  ///
668  /// The batches are not copied but now referenced from both datasets. Thus changing the appended
669  /// dataset might change this one as well.
670  void append(self_type const& other){
671  m_data.append(other.m_data);
672  m_label.append(other.m_label);
673  }
674 
675  void push_back(
676  typename Batch<InputType>::type const& inputs,
677  typename Batch<LabelType>::type const& labels
678  ){
679  m_data.push_back(inputs);
680  m_label.push_back(labels);
681  }
682 
683  void push_back(
684  const_batch_reference batch
685  ){
686  push_back(batch.inputs,batch.labels);
687  }
688 
689 
690  ///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
691  ///
692  ///After the operation the container will contain batchSizes.size() batches with the i-th batch having size batchSize[i].
693  ///However the sum of all batch sizes must be equal to the current number of elements
694  template<class Range>
695  void repartition(Range const& batchSizes){
696  m_data.repartition(batchSizes);
697  m_label.repartition(batchSizes);
698  }
699 
700  /// \brief Creates a vector with the batch sizes of every batch.
701  ///
702  /// This method can be used together with repartition to ensure
703  /// that two datasets have the same batch structure.
704  std::vector<std::size_t> getPartitioning()const{
705  return m_data.getPartitioning();
706  }
707 
708  friend void swap(LabeledData& a, LabeledData& b){
709  swap(a.m_data,b.m_data);
710  swap(a.m_label,b.m_label);
711  }
712 
713 
714  // SUBSETS
715 
716  ///\brief Fill in the subset defined by the list of indices.
717  void indexedSubset(IndexSet const& indices, self_type& subset) const{
718  m_data.indexedSubset(indices,subset.m_data);
719  m_label.indexedSubset(indices,subset.m_label);
720  }
721 
722  ///\brief Fill in the subset defined by the list of indices as well as its complement.
723  void indexedSubset(IndexSet const& indices, self_type& subset, self_type& complement)const{
724  IndexSet comp;
725  detail::complement(indices,m_data.numberOfBatches(),comp);
726  m_data.indexedSubset(indices,subset.m_data);
727  m_label.indexedSubset(indices,subset.m_label);
728  m_data.indexedSubset(comp,complement.m_data);
729  m_label.indexedSubset(comp,complement.m_label);
730  }
731 protected:
732  InputContainer m_data; /// point data
733  LabelContainer m_label; /// label data
734 };
735 
736 /// specialized template for classification with unsigned int labels
738 
739 /// specialized template for regression with RealVector labels
741 
742 /// specialized template for classification with unsigned int labels and sparse data
744 
745 template<class Functor, class T>
748 };
749 
750 
751 /**
752  * \addtogroup shark_globals
753  * @{
754  */
755 
756 /// \brief creates a data object from a range of elements
757 template<class Range>
759 createDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
760  typedef typename boost::range_value<Range const>::type Input;
761  typedef typename boost::range_iterator<Range const>::type Iterator;
762 
763  if (maximumBatchSize == 0)
764  maximumBatchSize = Data<Input>::DefaultBatchSize;
765 
766  std::size_t numPoints = shark::size(inputs);
767  //first determine the optimal number of batches as well as batch size
768  std::size_t batches = numPoints / maximumBatchSize;
769  if(numPoints > batches*maximumBatchSize)
770  ++batches;
771  std::size_t optimalBatchSize=numPoints/batches;
772  std::size_t remainder = numPoints-batches*optimalBatchSize;
773  Data<Input> data(batches);
774 
775  //now create the batches taking the remainder into account
776  Iterator start= boost::begin(inputs);
777  for(std::size_t i = 0; i != batches; ++i){
778  std::size_t size = (i<remainder)?optimalBatchSize+1:optimalBatchSize;
779  Iterator end = start+size;
780  data.batch(i) = createBatch<Input>(
781  boost::make_iterator_range(start,end)
782  );
783  start = end;
784  }
785 
786  return data;
787 }
788 
789 /// \brief creates a data object from a range of elements
790 template<class Range>
792 createUnlabeledDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
793  return createDataFromRange(inputs,maximumBatchSize);
794 }
795 /// \brief creates a labeled data object from two ranges, representing inputs and labels
796 template<class Range1, class Range2>
798  typename boost::range_value<Range1>::type,
799  typename boost::range_value<Range2>::type
800 > createLabeledDataFromRange(Range1 const& inputs, Range2 const& labels, std::size_t batchSize = 0){
801  SHARK_CHECK(boost::size(inputs) == boost::size(labels),
802  "[createDataFromRange] number of inputs and number of labels must agree");
803  typedef typename boost::range_value<Range1>::type Input;
804  typedef typename boost::range_value<Range2>::type Label;
805 
806  if (batchSize == 0)
808 
810  createDataFromRange(inputs,batchSize),
811  createDataFromRange(labels,batchSize)
812  );
813 }
814 
815 ///brief Outstream of elements for labeled data.
816 template<class T, class U>
817 std::ostream &operator << (std::ostream &stream, const LabeledData<T, U>& d) {
818  typedef typename LabeledData<T, U>::const_element_reference reference;
819  typename LabeledData<T, U>::const_element_range elements = d.elements();
820  BOOST_FOREACH(reference elem,elements)
821  stream << elem.input << " [" << elem.label <<"]"<< "\n";
822  return stream;
823 }
824 
825 
826 // FUNCTIONS FOR DIMENSIONALITY
827 
828 
829 ///\brief Return the number of classes of a set of class labels with unsigned int label encoding
830 inline unsigned int numberOfClasses(Data<unsigned int> const& labels){
831  unsigned int classes = 0;
832  for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
833  classes = std::max(classes,*std::max_element(labels.batch(i).begin(),labels.batch(i).end()));
834  }
835  return classes+1;
836 }
837 
838 ///\brief Returns the number of members of each class in the dataset.
839 inline std::vector<std::size_t> classSizes(Data<unsigned int> const& labels){
840  std::vector<std::size_t> classCounts(numberOfClasses(labels),0u);
841  for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
842  std::size_t batchSize = size(labels.batch(i));
843  for(std::size_t j = 0; j != batchSize; ++j){
844  classCounts[labels.batch(i)(j)]++;
845  }
846  }
847  return classCounts;
848 }
849 
850 ///\brief Return the dimensionality of a dataset.
851 template <class InputType>
852 std::size_t dataDimension(Data<InputType> const& dataset){
853  SHARK_ASSERT(dataset.numberOfElements() > 0);
854  return dataset.element(0).size();
855 }
856 
857 ///\brief Return the input dimensionality of a labeled dataset.
858 template <class InputType, class LabelType>
860  return dataDimension(dataset.inputs());
861 }
862 
863 ///\brief Return the label/output dimensionality of a labeled dataset.
864 template <class InputType, class LabelType>
866  return dataDimension(dataset.labels());
867 }
868 ///\brief Return the number of classes (highest label value +1) of a classification dataset with unsigned int label encoding
869 template <class InputType>
871  return numberOfClasses(dataset.labels());
872 }
873 ///\brief Returns the number of members of each class in the dataset.
874 template<class InputType, class LabelType>
875 inline std::vector<std::size_t> classSizes(LabeledData<InputType, LabelType> const& dataset){
876  return classSizes(dataset.labels());
877 }
878 
879 
//subsetting

///\brief Returns the subset of batches of a dataset addressed by the given indices.
template<class DatasetT>
DatasetT indexedSubset(
	DatasetT const& dataset,
	typename DatasetT::IndexSet const& indices
){
	DatasetT result;
	dataset.indexedSubset(indices,result);
	return result;
}
890 ///\brief Fill in the subset of batches [start,...,size+start[.
891 template<class DatasetT>
892 DatasetT rangeSubset(DatasetT const& dataset, std::size_t start, std::size_t end){
893  typename DatasetT::IndexSet indices;
894  detail::range(end-start, start, indices);
895  return indexedSubset(dataset,indices);
896 }
///\brief Returns the subset of batches [0,...,size[.
template<class DatasetT>
DatasetT rangeSubset(DatasetT const& dataset, std::size_t size){
	// BUG FIX: the arguments were previously passed as (size, 0), i.e.
	// start=size, end=0, so end-start wrapped around (std::size_t is
	// unsigned). The intended batch range is [0, size[.
	return rangeSubset(dataset,0,size);
}
902 
903 // TRANSFORMATION
904 ///\brief Transforms a dataset using a Functor f and returns the transformed result.
905 ///
906 /// this version is used, when the Functor supports only element-by-element transformations
907 template<class T,class Functor>
908 typename boost::lazy_disable_if<
911 >::type
912 transform(Data<T> const& data, Functor f){
913  typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
914  int batches = (int) data.numberOfBatches();
915  Data<ResultType> result(batches);
916  SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
917  result.batch(i)= createBatch<T>(boost::adaptors::transform(data.batch(i), f));
918  return result;
919 }
920 
921 ///\brief Transforms a dataset using a Functor f and returns the transformed result.
922 ///
923 /// this version is used, when the Functor supports batch-by-batch transformations
924 template<class T,class Functor>
925 typename boost::lazy_enable_if<
926  CanBeCalled<Functor,typename Data<T>::batch_type>,
928 >::type
929 transform(Data<T> const& data, Functor const& f){
930  typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
931  int batches = (int) data.numberOfBatches();
932  Data<ResultType> result(batches);
933  SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
934  result.batch(i)= f(data.batch(i));
935  return result;
936 }
937 
938 ///\brief Transforms the inputs of a dataset and return the transformed result.
939 template<class I,class L, class Functor>
941 transformInputs(LabeledData<I,L> const& data, Functor const& f){
943  return DatasetType(transform(data.inputs(),f),data.labels());
944 }
945 ///\brief Transforms the labels of a dataset and returns the transformed result.
946 template<class I,class L, class Functor>
948 transformLabels(LabeledData<I,L> const& data, Functor const& f){
950  return DatasetType(data.inputs(),transform(data.labels(),f));
951 }
952 
953 ///\brief Creates a copy o a dataset selecting only a certain set of features.
954 template<class FeatureSet>
955 Data<RealVector> selectFeatures(Data<RealVector> const& data,FeatureSet const& features){
956  return transform(data,detail::SelectFeatures<FeatureSet>(features));
957 }
958 
959 template<class T, class FeatureSet>
961  return transformInputs(data, detail::SelectFeatures<FeatureSet>(features));
962 }
963 
964 
965 
966 /// \brief Removes the last part of a given dataset and returns a new split containing the removed elements
967 ///
968 /// For this operation, the dataset is not allowed to be shared.
969 /// \brief data The dataset which should be splited
970 /// \brief index the first element to be split
971 /// \returns the set which contains the splitd element (right part of the given set)
972 template<class DatasetT>
973 DatasetT splitAtElement(DatasetT& data, std::size_t elementIndex){
974  SIZE_CHECK(elementIndex<=data.numberOfElements());
975 
976  std::size_t batchPos = 0;
977  std::size_t batchStart = 0;
978  while(batchStart + boost::size(data.batch(batchPos)) < elementIndex){
979  batchStart += boost::size(data.batch(batchPos));
980  ++batchPos;
981  };
982  std::size_t splitPoint = elementIndex-batchStart;
983  if(splitPoint != 0){
984  data.splitBatch(batchPos,splitPoint);
985  ++batchPos;
986  }
987 
988  return data.splice(batchPos);
989 }
990 
991 
992 ///\brief reorders the dataset such, that points are grouped by labels
993 ///
994 /// The elements are not only reordered but the batches are also resized such, that every batch
995 /// only contains elements of one class. This method must be used in order to use binarySubproblem.
996 template<class I>
998  std::vector<std::size_t > classCounts = classSizes(data);
999  std::vector<std::size_t > partitioning;//new, optimal partitioning of the data according to the batch sizes
1000  std::vector<std::size_t > classStart;//at which batch the elements of the class are starting
1001  detail::batchPartitioning(classCounts, classStart, partitioning, batchSize);
1002 
1003  data.repartition(partitioning);
1004 
1005  // Now place examples into the batches reserved for their class...
1006 
1007  // The following line does the job in principle but it crashes with clang on the mac:
1008 // boost::sort(data.elements());//todo we are lying here, use bidirectional iterator sort.
1009 
1010  // The following fixes the issue. As an aside it is even linear time:
1011  std::vector<std::size_t> bat = classStart; // batch index until which the class is already filled in
1012  std::vector<std::size_t> idx(classStart.size(), 0); // index within the batch until which the class is already filled in
1013  unsigned int c = 0; // current class in whose batch space we operate
1014  typedef typename Batch<I>::type InputBatchType;
1015  typedef typename Batch<unsigned int>::type LabelBatchType;
1016  for (std::size_t b=0; b<data.numberOfBatches(); b++)
1017  {
1018  // update class range index
1019  std::size_t e = 0;
1020  while (c + 1 < classStart.size() && b == classStart[c + 1])
1021  {
1022  c++;
1023  b = bat[c];
1024  e = idx[c];
1025  }
1026  if (b == data.numberOfBatches()) break;
1027 
1028  InputBatchType& bi1 = data.inputs().batch(b);
1029  LabelBatchType& bl1 = data.labels().batch(b);
1030  while (true)
1031  {
1032  unsigned int l = shark::get(bl1, e);
1033  if (l == c) // leave element in place
1034  {
1035  e++;
1036  idx[c] = e;
1037  if (e == boost::size(bl1))
1038  {
1039  e = 0;
1040  idx[c] = 0;
1041  bat[c]++;
1042  break;
1043  }
1044  }
1045  else // swap elements
1046  {
1047  InputBatchType& bi2 = data.inputs().batch(bat[l]);
1048  LabelBatchType& bl2 = data.labels().batch(bat[l]);
1049  swap(shark::get(bi1, e), shark::get(bi2, idx[l]));
1050  shark::get(bl1, e) = shark::get(bl2, idx[l]);
1051  shark::get(bl2, idx[l]) = l;
1052  idx[l]++;
1053  if (idx[l] == boost::size(bl2))
1054  {
1055  idx[l] = 0;
1056  bat[l]++;
1057  }
1058  }
1059  }
1060  }
1061 }
1062 
1063 template<class I>
1065  LabeledData<I,unsigned int>const& data,
1066  unsigned int zeroClass,
1067  unsigned int oneClass
1068 ){
1069  std::vector<std::size_t> indexSet;
1070  std::size_t smaller = std::min(zeroClass,oneClass);
1071  std::size_t bigger = std::max(zeroClass,oneClass);
1072  std::size_t numBatches = data.numberOfBatches();
1073 
1074  //find first class
1075  std::size_t start= 0;
1076  for(;start != numBatches && get(data.batch(start),0).label != smaller;++start);
1077  SHARK_CHECK(start != numBatches, "[shark::binarySubProblem] class does not exist");
1078 
1079  //copy batch indices of first class
1080  for(;start != numBatches && get(data.batch(start),0).label == smaller; ++start)
1081  indexSet.push_back(start);
1082 
1083  //find second class
1084  for(;start != numBatches && get(data.batch(start),0).label != bigger;++start);
1085  SHARK_CHECK(start != numBatches, "[shark::binarySubProblem] class does not exist");
1086 
1087  //copy batch indices of second class
1088  for(;start != numBatches && get(data.batch(start),0).label == bigger; ++start)
1089  indexSet.push_back(start);
1090 
1091  return transformLabels(indexedSubset(data,indexSet), detail::TransformOneVersusRestLabels(oneClass));
1092 }
1093 
1094 /// \brief Construct a binary (two-class) one-versus-rest problem from a multi-class problem.
1095 ///
1096 /// \par
1097 /// The function returns a new LabeledData object. The input part
1098 /// coincides with the multi-class data, but the label part is replaced
1099 /// with binary labels 0 and 1. All instances of the given class
1100 /// (parameter oneClass) get a label of one, all others are assigned a
1101 /// label of zero.
1102 template<class I>
1104  LabeledData<I,unsigned int>const& data,
1105  unsigned int oneClass)
1106 {
1107  return transformLabels(data, detail::TransformOneVersusRestLabels(oneClass));
1108 }
1109 
1110 ///
1111 ///\brief Transformation function multiplying the elements in a dataset by a scalar or component-wise by values stores in a vector
1112 ///
1113 class Multiply {
1114 public:
1115  ///@param factor All components of all vectors in the dataset are multiplied by this number
1116  Multiply(double factor) : m_factor(factor), m_scalar(true) {}
1117  ///@param factor For all elements in the dataset, the i-th component is multiplied with the i-th component of this vector
1118  Multiply(const RealVector factor) : m_factor(0), m_factorv(factor), m_scalar(false) {}
1119 
1120  typedef RealVector result_type;
1121 
1122  RealVector operator()(RealVector input) const {
1123  if(m_scalar) {
1124  for(std::size_t i = 0; i != input.size(); ++i) input(i) *= m_factor;
1125  return input;
1126  } else {
1127  SIZE_CHECK(m_factorv.size() == input.size());
1128  for(std::size_t i = 0; i != input.size(); ++i) input(i) *= m_factorv(i);
1129  return input;
1130  }
1131  }
1132 private:
1133  double m_factor;
1134  RealVector m_factorv;
1135  bool m_scalar;
1136 };
1137 
1138 ///
1139 ///\brief Transformation function dividing the elements in a dataset by a scalar or component-wise by values stores in a vector
1140 ///
1141 class Divide {
1142 public:
1143  ///@param factor All components of all vectors in the dataset are divided by this number
1144  Divide(double factor) : m_factor(factor), m_scalar(true) {}
1145  ///@param factor For all elements in the dataset, the i-th component is divided by the i-th component of this vector
1146  Divide(const RealVector factor) : m_factor(0), m_factorv(factor), m_scalar(false) {}
1147 
1148  typedef RealVector result_type;
1149 
1150  RealVector operator()(RealVector input) const {
1151  if(m_scalar) {
1152  for(std::size_t i = 0; i != input.size(); ++i) input(i) /= m_factor;
1153  return input;
1154  } else {
1155  SIZE_CHECK(m_factorv.size() == input.size());
1156  for(std::size_t i = 0; i != input.size(); ++i) input(i) /= m_factorv(i);
1157  return input;
1158  }
1159  }
1160 private:
1161  double m_factor;
1162  RealVector m_factorv;
1163  bool m_scalar;
1164 };
1165 
1166 
1167 ///
1168 ///\brief Transformation function adding a vector or a scalar to the elements in a dataset
1169 ///
1170 class Shift {
1171 public:
1172  ///@param offset Scalar added to all components of all vectors in the dataset
1173  Shift(double offset) : m_offset(offset), m_scalar(true) {}
1174  ///@param offset Vector added to vectors in the dataset
1175  Shift(const RealVector offset) : m_offsetv(offset), m_scalar(false) {}
1176 
1177  typedef RealVector result_type;
1178 
1179  RealVector operator()(RealVector input) const {
1180  if(m_scalar) {
1181  for(std::size_t i = 0; i != input.size(); ++i)
1182  input(i) += m_offset;
1183  } else {
1184  SIZE_CHECK(m_offsetv.size() == input.size());
1185  for(std::size_t i = 0; i != input.size(); ++i)
1186  input(i) += m_offsetv(i);
1187  }
1188  return input;
1189 
1190  }
1191 private:
1192  double m_offset;
1193  RealVector m_offsetv;
1194  bool m_scalar;
1195 };
1196 
1197 ///
1198 ///\brief Transformation function truncating elements in a dataset
1199 ///
1200 class Truncate {
1201 public:
1202  ///@param minValue All elements below this value are cut to the minimum value
1203  ///@param maxValue All elements above this value are cut to the maximum value
1204  Truncate(double minValue,double maxValue) : m_min(minValue), m_max(maxValue){}
1205  ///@param minv Lower bound for element-wise truncation
1206  ///@param maxv Upper bound for element-wise truncation
1207  Truncate(const RealVector minv, const RealVector maxv) : m_min(1), m_max(-1), m_minv(minv), m_maxv(maxv) { SIZE_CHECK(m_minv.size() == m_maxv.size()); }
1208 
1209  typedef RealVector result_type;
1210 
1211  RealVector operator()(RealVector input) const {
1212  if(m_min < m_max) {
1213  for(std::size_t i = 0; i != input.size(); ++i){
1214  input(i) = std::max(m_min, std::min(m_max, input(i)));
1215  }
1216  } else {
1217  SIZE_CHECK(m_minv.size() == input.size());
1218  for(std::size_t i = 0; i != input.size(); ++i){
1219  input(i) = std::max(m_minv(i), std::min(m_maxv(i), input(i)));
1220  }
1221  }
1222  return input;
1223  }
1224 private:
1225  double m_min;
1226  double m_max;
1227  RealVector m_minv;
1228  RealVector m_maxv;
1229 };
1230 
1231 ///
1232 ///\brief Transformation function first truncating and then rescaling elements in a dataset
1233 ///
1235 public:
1236  ///@param minCutValue All elements below this value are cut to the minimum value
1237  ///@param maxCutValue All elements above this value are cut to the maximum value
1238  ///@param minValue The imterval [minCutValue, maxCutValue] is mapped to [minValue, maxValue]
1239  ///@param maxValue The imterval [minCutValue, maxCutValue] is mapped to [minValue, maxValue]
1240  TruncateAndRescale(double minCutValue, double maxCutValue, double minValue = 0., double maxValue = 1.) : m_minCut(minCutValue), m_maxCut(maxCutValue), m_range(maxValue - minValue), m_min(minValue), m_scalar(true) {}
1241  ///@param minv Lower bound for element-wise truncation
1242  ///@param maxv Upper bound for element-wise truncation
1243  ///@param minValue The imterval [minv, maxv is mapped to [minValue, maxValue]
1244  ///@param maxValue The imterval [minv, maxv] is mapped to [minValue, maxValue]
1245  TruncateAndRescale(const RealVector minv, const RealVector maxv, double minValue = 0., double maxValue = 1.) : m_minCutv(minv), m_maxCutv(maxv), m_range(maxValue - minValue), m_min(minValue), m_scalar(false) { SIZE_CHECK(m_minCutv.size() == m_maxCutv.size()); }
1246 
1247  typedef RealVector result_type;
1248 
1249  RealVector operator()(RealVector input) const {
1250  if(m_scalar) {
1251  for(std::size_t i = 0; i != input.size(); ++i){
1252  input(i) = (std::max(m_minCut, std::min(m_maxCut, input(i))) - m_minCut) / (m_maxCut - m_minCut) * m_range + m_min;
1253  }
1254  } else {
1255  SIZE_CHECK(m_minCutv.size() == input.size());
1256  for(std::size_t i = 0; i != input.size(); ++i){
1257  input(i) = (std::max(m_minCutv(i), std::min(m_maxCutv(i), input(i))) - m_minCutv(i)) / (m_maxCutv(i) - m_minCutv(i)) * m_range + m_min;
1258  }
1259  }
1260  return input;
1261  }
1262 private:
1263  double m_minCut;
1264  double m_maxCut;
1265  RealVector m_minCutv;
1266  RealVector m_maxCutv;
1267  double m_range; // maximum - minimum
1268  double m_min;
1269  bool m_scalar;
1270 };
1271 
1272 
1273 template <typename RowType> RowType getColumn(const Data<RowType> &data, std::size_t columnID) {
1274  SHARK_ASSERT(dataDimension(data) > columnID);
1276  std::size_t rowCounter = 0;
1277  BOOST_FOREACH(typename Data<RowType>::const_element_reference row, data.elements()){
1278  column(rowCounter) = row(columnID);
1279  rowCounter++;
1280  }
1281  return column;
1282 }
1283 
1284 template <typename RowType> void setColumn(Data<RowType> &data, std::size_t columnID, RowType newColumn) {
1285  SHARK_ASSERT(dataDimension(data) > columnID);
1286  SHARK_ASSERT(data.numberOfElements() == newColumn.size());
1287  std::size_t rowCounter = 0;
1288  BOOST_FOREACH(typename Data<RowType>::element_reference row, data.elements()){
1289  row(columnID) = newColumn(rowCounter);
1290  rowCounter++;
1291  }
1292 }
1293 
/** @} */
1295 }
1296 
1297 #endif