Subsets.cpp
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Data Subsets
6  *
7  * This file is part of the tutorial "Creating and Using Subsets of Data".
8  * By itself, it does not do anything particularly useful.
9  *
10  * \author T. Glasmachers
11  * \date 2014
12  *
13  *
14  * \par Copyright 1995-2015 Shark Development Team
15  *
16  * <BR><HR>
17  * This file is part of Shark.
18  * <http://image.diku.dk/shark/>
19  *
20  * Shark is free software: you can redistribute it and/or modify
21  * it under the terms of the GNU Lesser General Public License as published
22  * by the Free Software Foundation, either version 3 of the License, or
23  * (at your option) any later version.
24  *
25  * Shark is distributed in the hope that it will be useful,
26  * but WITHOUT ANY WARRANTY; without even the implied warranty of
27  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28  * GNU Lesser General Public License for more details.
29  *
30  * You should have received a copy of the GNU Lesser General Public License
31  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
32  *
33  */
34 //===========================================================================
35 
36 #include <shark/Data/Dataset.h>
37 using namespace shark;
38 
40 
41 #include <shark/Data/DataView.h>
42 
43 
44 int main()
45 {
46  typedef RealVector I;
47  typedef unsigned int L;
48 
49  std::size_t start = 0, end = 42;
50 
51 {
52  LabeledData<I,L> dataset; // our dataset
53 
54  // create an indexed subset of batches
55  std::vector<std::size_t> indices; // indices of the batches to be contained in the subset
56  LabeledData<I,L> subset = indexedSubset(dataset, indices);
57 
58  // if also the complement of the set is needed, the call is:
59  LabeledData<I,L> complement;
60  dataset.indexedSubset(indices, subset, complement);
61 
62  // create subsets from ranges of batches
63  LabeledData<I,L> range1 = rangeSubset(dataset, start, end); // contains batches start,...,end-1
64  LabeledData<I,L> range2 = rangeSubset(dataset, end); // contains batches 0,...,end-1
65 
66  unsigned int k = 7;
67  LabeledData<I,L> remaining_batches = dataset.splice(k);
68 
69  LabeledData<I,L> remaining_elements = splitAtElement(dataset, k);
70 }
71 
73  // ...
74  repartitionByClass(data);
75 
76  std::size_t class0 = 0, class1 = 1;
77  ClassificationDataset subproblem = binarySubProblem(data, class0, class1);
78 
79 {
81 
82  // creating a random subset from indices
83  std::size_t k = 100;
84  std::vector<std::size_t> indices(view.size());
85  for (std::size_t i=0; i<view.size(); i++) indices[i] = i;
86  for (std::size_t i=0; i<k; i++) std::swap(indices[i], indices[rand() % view.size()]);
87  indices.resize(k);
88  DataView<ClassificationDataset> subset1 = subset(view, indices);
89 
90  // same functionality in one line
92 }
93 
94 {
95  std::size_t numberOfPartitions = 5;
96  std::vector<std::size_t> indices;
97  // Creates partitions of approximately the same size.
98  createCVSameSize(data, numberOfPartitions);
99 
100  // Creates IID drawn partitions of the data set (without replacement).
101  createCVIID(data, numberOfPartitions);
102 
103  // Creates indexed cross-validation sets. For each element the
104  // index describes the fold in which the data point acts as a
105  // validation example. This function offers maximal control.
106  createCVIndexed(data, numberOfPartitions, indices);
107 
108  createCVSameSizeBalanced(data, numberOfPartitions);
109 
110 }
111 
112 {
113  std::size_t numberOfPartitions = 5;
114  std::size_t numberOfFolds = 3;
116 
117  for (std::size_t i=0; i<numberOfPartitions; i++)
118  {
119  // as created in the above example
120  RegressionDataset training = folds.training(i);
121  RegressionDataset validation = folds.validation(i);
122  // explicit copy!
123  training.makeIndependent();
124  // creating a new fold
125  CVFolds<RegressionDataset> innerFolds = createCVSameSize(training, numberOfFolds);
126  }
127 }
128 
129 }