// NOTE(review): this extract keeps the original file's line numbers at the start of
// each statement and omits several lines (the class head, member-function signatures,
// closing braces). The comments below describe only the statements that are visible.
35 #ifndef SHARK_DATA_CVDATASETTOOLS_H 36 #define SHARK_DATA_CVDATASETTOOLS_H 49 template<
class DatasetTypeT>
// Index-set type used for the folds, taken from the dataset type.
53 typedef typename DatasetType::IndexSet
IndexSet;
// Constructor from explicit folds: initialises m_dataset from `set` and
// m_validationFolds from `validationIndizes`. The body is empty, so m_datasetSize
// and m_validationFoldSizes are not assigned by this constructor.
61 DatasetType
const &
set,
62 std::vector<IndexSet>
const &validationIndizes
63 ) : m_dataset(set),m_validationFolds(validationIndizes) {}
// Constructor from fold start positions: foldStart[p] is the first batch index of
// fold p. Fold p ends where fold p+1 starts; the last fold ends at
// set.numberOfBatches().
// NOTE(review): the member-initialiser line of this constructor is not shown here.
66 DatasetType
const &
set,
67 std::vector<std::size_t>
const &foldStart
69 for (std::size_t partition = 0; partition != foldStart.size(); partition++) {
// Number of batches in this fold = (start of next fold, or total batch count) - own start.
70 std::size_t partitionSize = (partition+1 == foldStart.size()) ?
set.numberOfBatches() : foldStart[partition+1];
71 partitionSize -= foldStart[partition];
// The validation index set of the fold is the consecutive run of batch indices
// foldStart[partition], foldStart[partition]+1, ...
74 IndexSet validationIndizes(partitionSize);
75 for (std::size_t batch = 0; batch != partitionSize; ++batch) {
76 validationIndizes[batch]=batch+foldStart[partition];
78 m_validationFolds.push_back(validationIndizes);
// Accessor: returns the stored validation index set at position i.
96 return m_validationFolds[i];
// Builds the training index set for position i by calling detail::complement with
// the validation index set and the number of batches of m_dataset.
// NOTE(review): detail::complement is defined elsewhere; presumably it fills
// trainingFold with all batch indices that are not in the validation set - confirm.
101 IndexSet trainingFold;
102 detail::complement(m_validationFolds[i], m_dataset.numberOfBatches(), trainingFold);
// Number of stored folds.
108 return m_validationFolds.size();
// The dataset the folds refer to.
137 DatasetType m_dataset;
// One validation index set per fold.
138 std::vector<IndexSet> m_validationFolds;
// NOTE(review): the two members below are not written by any statement visible in
// this extract; check whether they are still used.
139 std::size_t m_datasetSize;
140 std::vector<std::size_t> m_validationFoldSizes;
// NOTE(review): the line that names this function and declares its first parameter
// (`set`) is missing from this extract, as are the closing braces and the lines that
// define `uni`, `newSet`, `setView` and `j`. Comments describe the visible statements.
153 template<
class I,
class L>
// Number of folds to create.
156 std::size_t numberOfPartitions,
// members[c] lists the element positions of class c. It is taken by value, so the
// shuffle below works on a private copy.
157 std::vector< std::vector<std::size_t> > members,
// Batch size handed on to batchPartitioning.
158 std::size_t batchSize,
// Optional output: when non-null, `first` receives original element positions and
// `second` the fold each of them was assigned to.
159 RecreationIndices * cv_indices = NULL
165 std::size_t numInputs =
set.numberOfElements();
166 std::size_t numClasses = members.size();
// Shuffle the element positions inside every class.
// NOTE(review): std::random_shuffle was deprecated in C++14 and removed in C++17;
// replacing it needs the definition of `uni`, which is not visible here.
170 for (std::size_t c = 0; c != numClasses; c++) {
171 std::random_shuffle(members[c].begin(), members[c].end(), uni);
// Every fold receives numInputs / numberOfPartitions elements; the remainder of that
// division is handed out, one element each, to the first folds so that the fold
// sizes sum to numInputs.
// The remainder has to be taken modulo numberOfPartitions. The former
// `numInputs % nn` gave a different value whenever nn <= remainder (11 elements in
// 4 folds: 11 % 2 == 1 instead of 3) and was a modulo by zero when
// numInputs < numberOfPartitions.
175 std::size_t nn = numInputs / numberOfPartitions;
176 std::size_t leftOver = numInputs % numberOfPartitions;
177 std::vector<std::size_t> validationSize(numberOfPartitions,nn);
178 for (std::size_t partition = 0; partition != leftOver; partition++) {
179 validationSize[partition]++;
// Turn the fold sizes into a batch layout.
// NOTE(review): batchPartitioning is defined elsewhere; from its use below,
// partitionStart[f] is the first batch index of fold f and batchSizes[b] the number
// of elements in batch b - confirm against its definition.
183 std::vector<std::size_t> partitionStart;
184 std::vector<std::size_t> batchSizes;
185 std::size_t numBatches = batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
// validationSetStart[f] is a moving cursor: the batch of fold f that is filled next.
190 std::vector<std::size_t> validationSetStart = partitionStart;
// Fold that receives the next element.
192 std::size_t fold = 0;
// batchElements[f] collects element positions for the batch of fold f currently being filled.
193 std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
// Size both output vectors to one entry per element when the caller asked for them.
196 if ( cv_indices != NULL ) {
197 cv_indices->first.clear();
198 cv_indices->first.resize( numInputs );
199 cv_indices->second.clear();
200 cv_indices->second.resize( numInputs );
// Walk the (shuffled) elements class by class and deal them out to the folds.
204 for (std::size_t c = 0; c != numClasses; c++) {
205 for (std::size_t i = 0; i != members[c].size(); i++) {
206 std::size_t oldPos = members[c][i];
207 std::size_t batchNumber = validationSetStart[fold];
209 batchElements[fold].push_back(oldPos);
// Record the original position and the fold of this element; `j` is maintained on
// a line not shown in this extract.
211 if ( cv_indices != NULL ) {
212 cv_indices->first[ j ] = oldPos;
213 cv_indices->second[ j ] = fold;
// Once the pending elements of the fold fill its current batch, write the batch
// into newSet, start collecting afresh and move the fold's cursor to its next batch.
218 if (batchElements[fold].
size() == batchSizes[batchNumber]) {
219 newSet.
batch(validationSetStart[fold]) =
subBatch(setView,batchElements[fold]);
220 batchElements[fold].clear();
221 ++validationSetStart[fold];
// Advance to the next fold, wrapping around after the last one.
224 fold = (fold+1) % numberOfPartitions;
// NOTE(review): the function name, its remaining parameters, the loop body and the
// return statement are missing from this extract.
258 template<
class I,
class L>
// Number of folds to create.
260 std::size_t numberOfPartitions,
// One entry per element of `set`.
262 std::vector<std::size_t> indices(
set.numberOfElements());
// Visits every element position; the statement executed per element is not shown
// here (presumably it stores a partition number in indices[i] - confirm).
263 for (std::size_t i=0; i !=
set.numberOfElements(); i++)
// NOTE(review): the function name, its parameter list, closing braces and the return
// statement are missing from this extract.
279 template<
class I,
class L>
281 std::size_t numInputs =
set.numberOfElements();
// Fold sizes: every fold gets numInputs / numberOfPartitions elements and the first
// `leftOver` folds get one more, so the sizes sum to numInputs.
284 std::vector<std::size_t> validationSize(numberOfPartitions);
285 std::size_t inputsForValidation = numInputs / numberOfPartitions;
286 std::size_t leftOver = numInputs - inputsForValidation * numberOfPartitions;
287 for (std::size_t i = 0; i != numberOfPartitions; i++) {
// (i<leftOver) converts to 1 for the first leftOver folds and to 0 otherwise.
288 std::size_t vs=inputsForValidation+(i<leftOver);
289 validationSize[i] =vs;
// Derive a batch layout from the fold sizes; the returned batch count is not used here.
// NOTE(review): detail::batchPartitioning is defined elsewhere; the meaning of its
// outputs is inferred from their names - confirm.
293 std::vector<std::size_t> partitionStart;
294 std::vector<std::size_t> batchSizes;
295 detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
// Re-batch the dataset in place according to the computed batch sizes.
297 set.repartition(batchSizes);
// NOTE(review): the template head, function name, remaining parameters, the
// definitions of `setView` and `numClasses`, and the tail of the function are missing
// from this extract.
// Number of folds to create.
319 std::size_t numberOfPartitions,
// Optional output pointer; it is not dereferenced in the lines visible here.
321 RecreationIndices * cv_indices = NULL
324 std::size_t numInputs = setView.
size();
// Group element positions by class: members[c] collects the positions i whose label is c.
// The label is used directly as an index, so it must be smaller than numClasses.
329 std::vector< std::vector<std::size_t> > members(numClasses);
330 for (std::size_t i = 0; i != numInputs; i++) {
331 members[setView[i].label].push_back(i);
// NOTE(review): the function name, its first parameter, several loop-body lines and
// the return statement are missing from this extract.
345 template<
class I,
class L>
// Number of folds to create.
348 std::size_t numberOfPartitions
// One entry per batch of `set`.
350 std::vector<std::size_t> indizes(
set.numberOfBatches());
// Visits every batch index; the statement executed per batch is not shown here.
351 for(std::size_t i= 0; i !=
set.numberOfBatches(); ++i)
// Split the batch indices into numberOfPartitions consecutive ranges.
358 std::vector<IndexSet> folds;
// Base number of batches per fold and the number of batches left after an even split.
359 std::size_t partitionSize =
set.numberOfBatches()/numberOfPartitions;
360 std::size_t remainder =
set.numberOfBatches() - partitionSize*numberOfPartitions;
361 std::vector<std::size_t>::iterator pos = indizes.begin();
362 for(std::size_t i = 0; i!= numberOfPartitions; ++i){
// NOTE(review): the lines that use `remainder` and advance `pos` are not shown;
// presumably `size` is enlarged for some folds - confirm.
363 std::size_t
size = partitionSize;
// The fold consists of the `size` entries of indizes starting at pos.
368 folds.push_back(
IndexSet(pos,pos+size));
// NOTE(review): the function name, its first and last parameters, the definitions of
// `newSet` and `setView`, closing braces and the return statement are missing from
// this extract.
385 template<
class I,
class L>
// Number of folds to create.
388 std::size_t numberOfPartitions,
// indices[e] is the fold that element e is assigned to (taken by value).
389 std::vector<std::size_t> indices,
392 std::size_t numInputs =
set.numberOfElements();
// The largest fold number in `indices` has to be numberOfPartitions - 1.
// NOTE(review): SIZE_CHECK is a project macro defined elsewhere. Dereferencing the
// result of std::max_element is undefined when `indices` is empty.
394 SIZE_CHECK(numberOfPartitions == *std::max_element(indices.begin(),indices.end())+1);
// Count how many elements are assigned to each fold.
397 std::vector<std::size_t> validationSize(numberOfPartitions,0);
398 for (std::size_t input = 0; input != numInputs; input++) {
399 validationSize[indices[input]]++;
// Turn the fold sizes into a batch layout.
// NOTE(review): detail::batchPartitioning is defined elsewhere; from its use below,
// partitionStart[f] is the first batch index of fold f and batchSizes[b] the number
// of elements in batch b - confirm.
403 std::vector<std::size_t> partitionStart;
404 std::vector<std::size_t> batchSizes;
405 std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
// validationSetStart[f] is a moving cursor: the batch of fold f that is filled next.
// batchElements[f] collects element positions for that batch.
410 std::vector<std::size_t> validationSetStart = partitionStart;
411 std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
// Hand every element to its fold, in element order.
412 for (std::size_t input = 0; input != numInputs; input++) {
413 std::size_t partition = indices[input];
414 batchElements[partition].push_back(input);
// Once the pending elements fill the fold's current batch, write the batch into
// newSet, start collecting afresh and move the cursor to the fold's next batch.
417 std::size_t batchNumber = validationSetStart[partition];
418 if (batchElements[partition].
size() == batchSizes[batchNumber]) {
419 newSet.
batch(validationSetStart[partition]) =
subBatch(setView,batchElements[partition]);
420 batchElements[partition].clear();
421 ++validationSetStart[partition];
// NOTE(review): the function name, its first and last parameters, the definitions of
// `newSet` and `setView`, closing braces and the return statement are missing from
// this extract.
446 template<
class I,
class L>
// Number of folds to create.
449 std::size_t numberOfPartitions,
// indices.first[k] is an element position and indices.second[k] the fold it goes to
// (taken by value).
450 RecreationIndices indices,
453 std::size_t numInputs =
set.numberOfElements();
// Both index vectors need one entry per element, and the largest fold number has to
// be numberOfPartitions - 1.
// NOTE(review): SIZE_CHECK is a project macro defined elsewhere.
454 SIZE_CHECK(indices.first.size() == numInputs);
455 SIZE_CHECK(indices.second.size() == numInputs);
456 SIZE_CHECK(numberOfPartitions == *std::max_element(indices.second.begin(),indices.second.end())+1);
// Count how many elements are assigned to each fold.
459 std::vector<std::size_t> validationSize(numberOfPartitions,0);
460 for (std::size_t input = 0; input != numInputs; input++) {
461 validationSize[indices.second[input]]++;
// Turn the fold sizes into a batch layout.
// NOTE(review): detail::batchPartitioning is defined elsewhere; from its use below,
// partitionStart[f] is the first batch index of fold f and batchSizes[b] the number
// of elements in batch b - confirm.
465 std::vector<std::size_t> partitionStart;
466 std::vector<std::size_t> batchSizes;
467 std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
// validationSetStart[f] is a moving cursor: the batch of fold f that is filled next.
// batchElements[f] collects element positions for that batch.
472 std::vector<std::size_t> validationSetStart = partitionStart;
473 std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
// Replay the recorded assignment in its stored order: entry `input` sends element
// indices.first[input] to fold indices.second[input].
474 for (std::size_t input = 0; input != numInputs; input++) {
475 std::size_t partition = indices.second[input];
476 batchElements[partition].push_back( indices.first[input] );
// Once the pending elements fill the fold's current batch, write the batch into
// newSet, start collecting afresh and move the cursor to the fold's next batch.
479 std::size_t batchNumber = validationSetStart[partition];
480 if (batchElements[partition].
size() == batchSizes[batchNumber]) {
481 newSet.
batch(validationSetStart[partition]) =
subBatch(setView,batchElements[partition]);
482 batchElements[partition].clear();
483 ++validationSetStart[partition];
496 #include "Impl/CVDatasetTools.inl"