45 #ifndef SHARK_DATA_DATASET_H 46 #define SHARK_DATA_DATASET_H 48 #include <boost/foreach.hpp> 49 #include <boost/range/iterator_range.hpp> 50 #include <boost/range/algorithm/sort.hpp> 56 #include "Impl/Dataset.inl" 143 template <
class Type>
157 typedef typename Container::BatchType batch_type;
161 typedef Type element_type;
172 typedef boost::iterator_range<typename Container::element_iterator>
element_range;
174 typedef boost::iterator_range<typename Container::iterator>
batch_range;
210 return m_data.size();
214 return m_data.numberOfElements();
219 return m_data.empty();
224 return *(m_data.elemBegin()+i);
226 const_element_reference
element(std::size_t i)
const{
227 return *(m_data.elemBegin()+i);
231 batch_reference
batch(std::size_t i){
232 return *(m_data.begin()+i);
234 const_batch_reference
batch(std::size_t i)
const{
235 return *(m_data.begin()+i);
244 explicit Data(std::size_t numBatches) : m_data( numBatches )
248 explicit Data(
Data const& container, std::vector<std::size_t> batchSizes)
249 : m_data( container.m_data, batchSizes, true )
259 explicit Data(std::size_t
size, element_type
const&
element, std::size_t batchSize = DefaultBatchSize)
260 : m_data(size,element,batchSize)
274 m_data.makeIndependent();
281 m_data.splitBatch(m_data.begin()+
batch,elementIndex);
299 m_data.append(other.m_data);
303 m_data.push_back(batch);
310 template<
class Range>
312 m_data.repartition(batchSizes);
320 return m_data.getPartitioning();
332 detail::complement(indices,m_data.size(),comp);
334 complement.m_data=
Container(m_data,comp);
349 std::ostream &operator << (std::ostream &stream, const Data<T>& d) {
352 BOOST_FOREACH(reference elem,elements)
353 stream << elem <<
"\n";
363 template <
class InputT>
367 typedef InputT element_type;
374 using base_type::m_data;
394 : base_type(size,element,batchSize)
401 : base_type(numBatches)
406 :base_type(container,batchSizes){}
458 template <
class InputT,
class LabelT>
473 typedef DataBatchPair<
516 return zipPairRange<element_type>(
m_data.elements(),m_label.elements());
523 return zipPairRange<element_type>(
m_data.elements(),m_label.elements());
531 return zipPairRange<batch_type>(
m_data.batches(),m_label.batches());
538 return zipPairRange<batch_type>(
m_data.batches(),m_label.batches());
543 return m_data.numberOfBatches();
547 return m_data.numberOfElements();
583 :
m_data(numBatches),m_label(numBatches)
593 :
m_data(size,element.input,batchSize),
594 m_label(size,element.label,batchSize)
602 :
m_data(inputs), m_label(labels)
615 const_element_reference
element(std::size_t i)
const{
620 batch_reference
batch(std::size_t i){
623 const_batch_reference
batch(std::size_t i)
const{
643 m_label.makeIndependent();
654 m_data.splitBatch(batch,elementIndex);
655 m_label.splitBatch(batch,elementIndex);
663 return self_type(
m_data.splice(batch),m_label.splice(batch));
671 m_data.append(other.m_data);
672 m_label.append(other.m_label);
676 typename Batch<InputType>::type
const& inputs,
680 m_label.push_back(labels);
684 const_batch_reference
batch 694 template<
class Range>
696 m_data.repartition(batchSizes);
697 m_label.repartition(batchSizes);
705 return m_data.getPartitioning();
718 m_data.indexedSubset(indices,subset.m_data);
719 m_label.indexedSubset(indices,subset.m_label);
725 detail::complement(indices,
m_data.numberOfBatches(),comp);
726 m_data.indexedSubset(indices,subset.m_data);
727 m_label.indexedSubset(indices,subset.m_label);
728 m_data.indexedSubset(comp,complement.m_data);
729 m_label.indexedSubset(comp,complement.m_label);
745 template<
class Functor,
class T>
757 template<
class Range>
760 typedef typename boost::range_value<Range const>::type Input;
761 typedef typename boost::range_iterator<Range const>::type Iterator;
763 if (maximumBatchSize == 0)
768 std::size_t
batches = numPoints / maximumBatchSize;
769 if(numPoints > batches*maximumBatchSize)
771 std::size_t optimalBatchSize=numPoints/
batches;
772 std::size_t remainder = numPoints-batches*optimalBatchSize;
776 Iterator start= boost::begin(inputs);
777 for(std::size_t i = 0; i !=
batches; ++i){
778 std::size_t
size = (i<remainder)?optimalBatchSize+1:optimalBatchSize;
779 Iterator end = start+
size;
780 data.
batch(i) = createBatch<Input>(
781 boost::make_iterator_range(start,end)
790 template<
class Range>
796 template<
class Range1,
class Range2>
798 typename boost::range_value<Range1>::type,
799 typename boost::range_value<Range2>::type
802 "[createDataFromRange] number of inputs and number of labels must agree");
803 typedef typename boost::range_value<Range1>::type Input;
804 typedef typename boost::range_value<Range2>::type Label;
816 template<
class T,
class U>
817 std::ostream &operator << (std::ostream &stream, const LabeledData<T, U>& d) {
820 BOOST_FOREACH(reference elem,elements)
821 stream << elem.input <<
" [" << elem.label <<
"]"<<
"\n";
831 unsigned int classes = 0;
833 classes =
std::max(classes,*std::max_element(labels.
batch(i).begin(),labels.
batch(i).end()));
842 std::size_t batchSize =
size(labels.
batch(i));
843 for(std::size_t j = 0; j != batchSize; ++j){
844 classCounts[labels.
batch(i)(j)]++;
851 template <
class InputType>
854 return dataset.
element(0).size();
858 template <
class InputType,
class LabelType>
864 template <
class InputType,
class LabelType>
869 template <
class InputType>
874 template<
class InputType,
class LabelType>
881 template<
class DatasetT>
883 DatasetT
const& dataset,
884 typename DatasetT::IndexSet
const& indices
887 dataset.indexedSubset(indices,subset);
891 template<
class DatasetT>
892 DatasetT
rangeSubset(DatasetT
const& dataset, std::size_t start, std::size_t end){
893 typename DatasetT::IndexSet indices;
894 detail::range(end-start, start, indices);
898 template<
class DatasetT>
907 template<
class T,
class Functor>
908 typename boost::lazy_disable_if<
913 typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
924 template<
class T,
class Functor>
925 typename boost::lazy_enable_if<
926 CanBeCalled<Functor,typename Data<T>::batch_type>,
930 typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
939 template<
class I,
class L,
class Functor>
946 template<
class I,
class L,
class Functor>
954 template<
class FeatureSet>
956 return transform(data,detail::SelectFeatures<FeatureSet>(features));
959 template<
class T,
class FeatureSet>
961 return transformInputs(data, detail::SelectFeatures<FeatureSet>(features));
972 template<
class DatasetT>
974 SIZE_CHECK(elementIndex<=data.numberOfElements());
976 std::size_t batchPos = 0;
977 std::size_t batchStart = 0;
978 while(batchStart +
boost::size(data.batch(batchPos)) < elementIndex){
982 std::size_t splitPoint = elementIndex-batchStart;
984 data.splitBatch(batchPos,splitPoint);
988 return data.splice(batchPos);
998 std::vector<std::size_t > classCounts =
classSizes(data);
999 std::vector<std::size_t > partitioning;
1000 std::vector<std::size_t > classStart;
1001 detail::batchPartitioning(classCounts, classStart, partitioning, batchSize);
1011 std::vector<std::size_t> bat = classStart;
1012 std::vector<std::size_t> idx(classStart.size(), 0);
1020 while (c + 1 < classStart.size() && b == classStart[c + 1])
1028 InputBatchType& bi1 = data.
inputs().batch(b);
1029 LabelBatchType& bl1 = data.
labels().batch(b);
1047 InputBatchType& bi2 = data.
inputs().batch(bat[l]);
1048 LabelBatchType& bl2 = data.
labels().batch(bat[l]);
1066 unsigned int zeroClass,
1067 unsigned int oneClass
1069 std::vector<std::size_t> indexSet;
1070 std::size_t smaller =
std::min(zeroClass,oneClass);
1071 std::size_t bigger =
std::max(zeroClass,oneClass);
1075 std::size_t start= 0;
1076 for(;start != numBatches &&
get(data.
batch(start),0).label != smaller;++start);
1077 SHARK_CHECK(start != numBatches,
"[shark::binarySubProblem] class does not exist");
1080 for(;start != numBatches &&
get(data.
batch(start),0).label == smaller; ++start)
1081 indexSet.push_back(start);
1084 for(;start != numBatches &&
get(data.
batch(start),0).label != bigger;++start);
1085 SHARK_CHECK(start != numBatches,
"[shark::binarySubProblem] class does not exist");
1088 for(;start != numBatches &&
get(data.
batch(start),0).label == bigger; ++start)
1089 indexSet.push_back(start);
1105 unsigned int oneClass)
1107 return transformLabels(data, detail::TransformOneVersusRestLabels(oneClass));
1116 Multiply(
double factor) : m_factor(factor), m_scalar(true) {}
1118 Multiply(
const RealVector factor) : m_factor(0), m_factorv(factor), m_scalar(false) {}
1120 typedef RealVector result_type;
1124 for(std::size_t i = 0; i != input.size(); ++i) input(i) *= m_factor;
1127 SIZE_CHECK(m_factorv.size() == input.size());
1128 for(std::size_t i = 0; i != input.size(); ++i) input(i) *= m_factorv(i);
1134 RealVector m_factorv;
1144 Divide(
double factor) : m_factor(factor), m_scalar(true) {}
1146 Divide(
const RealVector factor) : m_factor(0), m_factorv(factor), m_scalar(false) {}
1148 typedef RealVector result_type;
1152 for(std::size_t i = 0; i != input.size(); ++i) input(i) /= m_factor;
1155 SIZE_CHECK(m_factorv.size() == input.size());
1156 for(std::size_t i = 0; i != input.size(); ++i) input(i) /= m_factorv(i);
1162 RealVector m_factorv;
1173 Shift(
double offset) : m_offset(offset), m_scalar(true) {}
1175 Shift(
const RealVector offset) : m_offsetv(offset), m_scalar(false) {}
1177 typedef RealVector result_type;
1181 for(std::size_t i = 0; i != input.size(); ++i)
1182 input(i) += m_offset;
1184 SIZE_CHECK(m_offsetv.size() == input.size());
1185 for(std::size_t i = 0; i != input.size(); ++i)
1186 input(i) += m_offsetv(i);
1193 RealVector m_offsetv;
1204 Truncate(
double minValue,
double maxValue) : m_min(minValue), m_max(maxValue){}
1207 Truncate(
const RealVector minv,
const RealVector maxv) : m_min(1), m_max(-1), m_minv(minv), m_maxv(maxv) {
SIZE_CHECK(m_minv.size() == m_maxv.size()); }
1209 typedef RealVector result_type;
1213 for(std::size_t i = 0; i != input.size(); ++i){
1218 for(std::size_t i = 0; i != input.size(); ++i){
1240 TruncateAndRescale(
double minCutValue,
double maxCutValue,
double minValue = 0.,
double maxValue = 1.) : m_minCut(minCutValue), m_maxCut(maxCutValue), m_range(maxValue - minValue), m_min(minValue), m_scalar(true) {}
1245 TruncateAndRescale(
const RealVector minv,
const RealVector maxv,
double minValue = 0.,
double maxValue = 1.) : m_minCutv(minv), m_maxCutv(maxv), m_range(maxValue - minValue), m_min(minValue), m_scalar(false) {
SIZE_CHECK(m_minCutv.size() == m_maxCutv.size()); }
1247 typedef RealVector result_type;
1251 for(std::size_t i = 0; i != input.size(); ++i){
1252 input(i) = (
std::max(m_minCut,
std::min(m_maxCut, input(i))) - m_minCut) / (m_maxCut - m_minCut) * m_range + m_min;
1255 SIZE_CHECK(m_minCutv.size() == input.size());
1256 for(std::size_t i = 0; i != input.size(); ++i){
1257 input(i) = (
std::max(m_minCutv(i),
std::min(m_maxCutv(i), input(i))) - m_minCutv(i)) / (m_maxCutv(i) - m_minCutv(i)) * m_range + m_min;
1265 RealVector m_minCutv;
1266 RealVector m_maxCutv;
1276 std::size_t rowCounter = 0;
1287 std::size_t rowCounter = 0;
1289 row(columnID) = newColumn(rowCounter);