DataView.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Fast lookup for elements in constant datasets
6  *
7  *
8  *
9  *
10  *
11  * \author O. Krause
12  * \date 2012
13  *
14  *
15  * \par Copyright 1995-2015 Shark Development Team
16  *
17  * <BR><HR>
18  * This file is part of Shark.
19  * <http://image.diku.dk/shark/>
20  *
21  * Shark is free software: you can redistribute it and/or modify
22  * it under the terms of the GNU Lesser General Public License as published
23  * by the Free Software Foundation, either version 3 of the License, or
24  * (at your option) any later version.
25  *
26  * Shark is distributed in the hope that it will be useful,
27  * but WITHOUT ANY WARRANTY; without even the implied warranty of
28  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29  * GNU Lesser General Public License for more details.
30  *
31  * You should have received a copy of the GNU Lesser General Public License
32  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
33  *
34  */
35 //===========================================================================
36 
37 
38 #ifndef SHARK_DATA_DATAVIEW_H
39 #define SHARK_DATA_DATAVIEW_H
40 
41 #include <shark/Data/Dataset.h>
43 #include <boost/type_traits/is_const.hpp>
44 #include <boost/range/adaptor/transformed.hpp>
45 #include <boost/bind.hpp>
46 #include <boost/range/algorithm/copy.hpp>
47 namespace shark {
48 /// \brief Constant time Element-Lookup for Datasets
49 ///
50 /// Datasets are fast for random lookup of batches. Since batch sizes can be arbitrary structured and
51 /// changed by the user, there is no way for the Data and LabeledData classes to provide fast random access
52 /// to single elements. Still, this property is needed quite often, for example for creating subsets,
53 /// randomize data or tree structures.
54 /// A View stores the position of every element in a dataset. So it has constant time access to the elements but
55 /// it also requires linear memory in the number of elements in the set. This is typically small compared
56 /// to the size of the set itself, but construction imposes an considerable overhead.
57 ///
58 /// In contrast to (Un)LabeledData, which is centered around batches, the View is centered around single elements,
59 /// so its iterators iterate over the elements.
60 /// For a better support for bagging an index method is added which returns the position of the element in the
61 /// underlying data container. Also the iterators are indexed and return this index.
62 template <class DatasetType> //template parameter can be const!
63 class DataView
64 {
65 public:
66  typedef typename boost::remove_const<DatasetType>::type dataset_type; //(non const) type of the underlying dataset
67  typedef typename dataset_type::element_type value_type;
68  typedef typename dataset_type::const_element_reference const_reference;
69  typedef typename dataset_type::batch_type batch_type;
70  // We want to support immutable as well as mutable datasets. So we query whether the dataset
71  // is mutable and change the reference type to const if the dataset is immutable.
72  typedef typename boost::mpl::if_<
73  boost::is_const<DatasetType>,
74  typename dataset_type::const_element_reference,
75  typename dataset_type::element_reference
76  >::type reference;
77 
78 private:
79  typedef typename boost::mpl::if_<
80  boost::is_const<DatasetType>,
81  typename dataset_type::const_batch_range,
82  typename dataset_type::batch_range
83  >::type batch_range;
84  template<class Reference, class View>
85  class IteratorBase: public SHARK_ITERATOR_FACADE<
86  IteratorBase<Reference,View>,
87  value_type,
88  std::random_access_iterator_tag,
89  Reference
90  >{
91  public:
92  IteratorBase(){}
93 
94  IteratorBase(View& view, std::size_t position)
95  : mpe_view(&view),m_position(position) {}
96 
97  template<class R,class V>
98  IteratorBase(IteratorBase<R,V> const& other)
99  : mpe_view(other.mpe_view),m_position(other.position){}
100 
101  /// \brief returns the position of the element referenced by the iterator inside the dataset
102  ///
103  /// This is usefull for bagging, when identical elements between several susbsets are to be identified
104  std::size_t index()const{
105  return mpe_view->index(m_position);
106  }
107 
108  private:
109  friend class SHARK_ITERATOR_CORE_ACCESS;
110  template <class, class> friend class IteratorBase;
111 
112  void increment() {
113  ++m_position;
114  }
115  void decrement() {
116  --m_position;
117  }
118 
119  void advance(std::ptrdiff_t n){
120  m_position+=n;
121  }
122 
123  template<class R,class V>
124  std::ptrdiff_t distance_to(IteratorBase<R,V> const& other) const{
125  return (std::ptrdiff_t)other.m_position - (std::ptrdiff_t)m_position;
126  }
127 
128  template<class R,class V>
129  bool equal(IteratorBase<R,V> const& other) const{
130  return m_position == other.m_position;
131  }
132  Reference dereference() const {
133  return (*mpe_view)[m_position];
134  }
135 
136  View* mpe_view;
137  std::size_t m_position;
138  };
139 public:
140  typedef IteratorBase<reference,DataView<DatasetType> > iterator;
141  typedef IteratorBase<const_reference, DataView<DatasetType> const > const_iterator;
142 
144  DataView(DatasetType& dataset)
145  :m_dataset(dataset),m_indices(dataset.numberOfElements())
146  {
147  std::size_t index = 0;
148  for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){
149  std::size_t batchSize = shark::size(dataset.batch(i));
150  for(std::size_t j = 0; j != batchSize; ++j,++index){
151  m_indices[index].batch = i;
152  m_indices[index].positionInBatch = j;
153  m_indices[index].datasetIndex = index;
154  }
155  }
156  }
157 
158  /// create a subset of the dataset type using only the elemnt indexed by indices
159  template<class IndexRange>
160  DataView(DataView<DatasetType> const& view, IndexRange const& indices)
161  :m_dataset(view.m_dataset),m_indices(shark::size(indices))
162  {
163  for(std::size_t i = 0; i != m_indices.size(); ++i)
164  m_indices[i] = view.m_indices[indices[i]];
165  }
166 
167  reference operator[](std::size_t position){
168  SIZE_CHECK(position < size());
169  Index const& index = m_indices[position];
170  return get(m_dataset.batch(index.batch),index.positionInBatch);
171  }
172  const_reference operator[](std::size_t position) const{
173  SIZE_CHECK(position < size());
174  Index const& index = m_indices[position];
175  return get(m_dataset.batch(index.batch),index.positionInBatch);
176  }
177  /// \brief returns the position of the element inside the dataset
178  ///
179  /// This is useful for bagging, when identical elements among
180  /// several subsets are to be identified.
181  std::size_t index(std::size_t position)const{
182  return m_indices[position].datasetIndex;
183  }
184 
185  std::size_t size() const{
186  return m_indices.size();
187  }
188 
189  iterator begin(){
190  return iterator(*this, 0);
191  }
192  const_iterator begin() const{
193  return const_iterator(*this, 0);
194  }
195  iterator end(){
196  return iterator(*this, size());
197  }
198  const_iterator end() const{
199  return const_iterator(*this, size());
200  }
201 
202  dataset_type const& dataset()const{
203  return m_dataset;
204  }
205 private:
206  dataset_type m_dataset;
207  // Stores for an element of the dataset, at which position of which batch it is located
208  // as well as the real index of the element inside the dataset
209  struct Index{
210  std::size_t batch;//the batch in which the element is located
211  std::size_t positionInBatch;//at which position in the batch it is
212  std::size_t datasetIndex;//index inside the dataset
213  };
214  std::vector<Index> m_indices;//stores for every element of the set it's position inside the dataset
215 };
216 
217 /// \brief creates a subset of a DataView with elements indexed by indices
218 ///
219 /// \param view the view for which the subset is to be created
220 /// \param indizes the index of the elements to be stored in the view
221 template<class DatasetType,class IndexRange>
222 DataView<DatasetType> subset(DataView<DatasetType> const& view, IndexRange const& indizes){
223  //O.K. todo: Remove constructor later on, this is a quick fix.
224  return DataView<DatasetType>(view,indizes);
225 }
226 
227 /// \brief creates a random subset of a DataView with given size
228 ///
229 /// \param view the view for which the subset is to be created
230 /// \param size the size of the subset
231 template<class DatasetType>
233  std::vector<std::size_t> indices(view.size());
234  boost::iota(indices,0);
235  partial_shuffle(indices.begin(),indices.begin()+size,indices.end());
236  return subset(view,boost::make_iterator_range(indices.begin(),indices.begin()+size));
237 }
238 
239 /// \brief Creates a batch given a set of indices
240 ///
241 /// \param view the view from which the batch is to be created
242 /// \param indizes the set of indizes defining the batch
243 template<class DatasetType,class IndexRange>
244 typename DataView<DatasetType>::batch_type subBatch(
245  DataView<DatasetType> const& view,
246  IndexRange const& indizes
247 ){
248  //create a subset of the view containing the elements of the batch
249  DataView<DatasetType> batchElems = subset(view,indizes);
250 
251  //and now use the batch range construction to create it
253 }
254 
255 /// \brief Creates a random batch of a given size
256 ///
257 /// \param view the view from which the batch is to be created
258 /// \param size the size of the batch
259 template<class DatasetType>
260 typename DataView<DatasetType>::batch_type randomSubBatch(
261  DataView<DatasetType> const& view,
262  std::size_t size
263 ){
264  std::vector<std::size_t> indices(view.size());
265  boost::iota(indices,0);
266  partial_shuffle(indices.begin(),indices.begin()+size,indices.end());
267  return subBatch(view,boost::make_iterator_range(indices.begin(),indices.begin()+size));
268 }
269 
270 /// \brief Creates a View from a dataset.
271 ///
272 /// This is just a helper function to omit the actual type of the view
273 ///
274 /// \param set the dataset from which to create the view
275 template<class DatasetType>
276 DataView<DatasetType> toView(DatasetType& set){
277  return DataView<DatasetType>(set);
278 }
279 
280 /// \brief Creates a new dataset from a View.
281 ///
282 /// When the elements of a View needs to be processed repeatedly it is often better to use
283 /// the packed format of the Dataset again, since then the faster batch processing can be used
284 ///
285 /// \param view the view from which to create the new dataset
286 /// \param batchSize the size of the batches in the dataset
287 template<class T>
288 typename DataView<T>::dataset_type
290  if(view.size() == 0)
291  return typename DataView<T>::dataset_type();
292  //O.K. todo: this is slow for sparse elements, use subBatch or something similar.
293  std::size_t elements = view.size();
294  typename DataView<T>::dataset_type dataset(elements,view[0],batchSize);
295  std::size_t batches = dataset.numberOfBatches();
296 
297  std::size_t element = 0;
298  for(std::size_t i = 0; i != batches; ++i){
299  std::size_t batchSize = shark::size(dataset.batch(i));
300  for(std::size_t j = 0; j != batchSize; ++j, ++element){
301  get(dataset.batch(i),j) = view[element];
302  }
303  }
304  return dataset;
305 }
306 
307 /// Return the number of classes (size of the label vector)
308 /// of a classification dataset with RealVector label encoding.
309 template <class DatasetType>
310 std::size_t numberOfClasses(DataView<DatasetType> const& view){
311  return numberOfClasses(view.dataset());
312 }
313 
314 /// Return the input dimensionality of the labeled dataset represented by the view
315 template <class DatasetType>
316 std::size_t inputDimension(DataView<DatasetType> const& view){
317  return inputDimension(view.dataset());
318 }
319 /// Return the label dimensionality of the labeled dataset represented by the view
320 template <class DatasetType>
321 std::size_t labelDimension(DataView<DatasetType> const& view){
322  return labelDimension(view.dataset());
323 }
324 
325 /// Return the dimensionality of the dataset represented by the view
326 template <class DatasetType>
327 std::size_t dataDimension(DataView<DatasetType> const& view){
328  return dataDimension(view.dataset());
329 }
330 
331 
332 /** @*/
333 }
334 #endif