SparseData.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Support for importing and exporting data from and to sparse data (libSVM) formatted data files
6  *
7  *
8  * \par
9  * The most important application of the methods provided in this
10  * file is the import of data from LIBSVM files to Shark Data containers.
11  *
12  *
13  *
14  *
15  * \author M. Tuma, T. Glasmachers, C. Igel
16  * \date 2010
17  *
18  *
19  * \par Copyright 1995-2015 Shark Development Team
20  *
21  * <BR><HR>
22  * This file is part of Shark.
23  * <http://image.diku.dk/shark/>
24  *
25  * Shark is free software: you can redistribute it and/or modify
26  * it under the terms of the GNU Lesser General Public License as published
27  * by the Free Software Foundation, either version 3 of the License, or
28  * (at your option) any later version.
29  *
30  * Shark is distributed in the hope that it will be useful,
31  * but WITHOUT ANY WARRANTY; without even the implied warranty of
32  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33  * GNU Lesser General Public License for more details.
34  *
35  * You should have received a copy of the GNU Lesser General Public License
36  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
37  *
38  */
39 //===========================================================================
40 
41 #ifndef SHARK_DATA_SPARSEDATA_H
42 #define SHARK_DATA_SPARSEDATA_H
43 
#include <shark/Core/DLLSupport.h>
#include <shark/Data/Dataset.h>

#include <algorithm>
#include <fstream>
#include <string>
#include <utility>
#include <vector>
47 
48 namespace shark {
49 
50 namespace detail {
51 
/// Pairs a class label (first) with the index of the data point it belongs to (second).
typedef std::pair< unsigned int, std::size_t > LabelSortPair;

/// Ordering predicate that looks at the label only: \a left goes before \a right
/// when its label is the larger one, so sorting with it yields decreasing labels.
static inline bool cmpLabelSortPair(const LabelSortPair& left, const LabelSortPair& right) {
	return right.first < left.first;
}
56 
57 } // namespace detail
58 
59 /**
60  * \ingroup shark_globals
61  *
62  * @{
63  */
64 
65 
66 
67 /// \brief Import data from a sparse data (libSVM) file.
68 ///
69 /// \param dataset container storing the loaded data
70 /// \param stream stream to be read from
71 /// \param highestIndex highest feature index, or 0 for auto-detection
72 /// \param batchSize size of batch
74  LabeledData<RealVector, unsigned int>& dataset,
75  std::istream& stream,
76  unsigned int highestIndex = 0,
77  std::size_t batchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
78 );
79 
80 /// \brief Import data from a sparse data (libSVM) file.
81 ///
82 /// \param dataset container storing the loaded data
83 /// \param stream stream to be read from
84 /// \param highestIndex highest feature index, or 0 for auto-detection
85 /// \param batchSize size of batch
87  LabeledData<CompressedRealVector, unsigned int>& dataset,
88  std::istream& stream,
89  unsigned int highestIndex = 0,
90  std::size_t batchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
91 );
92 
93 /// \brief Import data from a sparse data (libSVM) file.
94 ///
95 /// \param dataset container storing the loaded data
96 /// \param fn the file to be read from
97 /// \param highestIndex highest feature index, or 0 for auto-detection
98 /// \param batchSize size of batch
100  LabeledData<RealVector, unsigned int>& dataset,
101  std::string fn,
102  unsigned int highestIndex = 0,
103  std::size_t batchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
104 );
105 
106 /// \brief Import data from a sparse data (libSVM) file.
107 ///
108 /// \param dataset container storing the loaded data
109 /// \param fn the file to be read from
110 /// \param highestIndex highest feature index, or 0 for auto-detection
111 /// \param batchSize size of batch
113  LabeledData<CompressedRealVector, unsigned int>& dataset,
114  std::string fn,
115  unsigned int highestIndex = 0,
116  std::size_t batchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
117 );
118 
119 
120 /// \brief Export data to sparse data (libSVM) format.
121 ///
122 /// \param dataset Container storing the data
123 /// \param fn Output file
124 /// \param dense Flag for using dense output format
125 /// \param oneMinusOne Flag for applying the transformation y<-2y-1 to binary labels
126 /// \param sortLabels Flag for sorting data points according to labels
127 /// \param append Flag for appending to the output file instead of overwriting it
128 template<typename InputType>
129 void exportSparseData(LabeledData<InputType, unsigned int>& dataset, const std::string &fn, bool dense=false, bool oneMinusOne = true, bool sortLabels = false, bool append = false) {
130  std::size_t elements = dataset.numberOfElements();
131  std::ofstream ofs;
132 
133  // shall we append only or overwrite?
134  if (append == true) {
135  ofs.open (fn.c_str(), std::fstream::out | std::fstream::app );
136  } else {
137  ofs.open (fn.c_str());
138  }
139 
140  if( !ofs ) {
141  throw( SHARKEXCEPTION( "[exportSparseData] file can not be opened for writing" ) );
142  }
143 
144  size_t dim = inputDimension(dataset);
145  if(numberOfClasses(dataset)!=2) oneMinusOne = false;
146 
147  std::vector<detail::LabelSortPair> L;
148  if(sortLabels) {
149  for(std::size_t i = 0; i < elements; i++)
150  L.push_back(detail::LabelSortPair(dataset.element(i).label, i));
151  std::sort (L.begin(), L.end(), detail::cmpLabelSortPair);
152  }
153 
154  for(std::size_t ii = 0; ii < elements; ii++) {
155  // apply mapping to sorted indices
156  std::size_t i = 0;
157  if(sortLabels) i = L[ii].second;
158  else i = ii;
159  // apply transformation to label and write it to file
160  if(oneMinusOne) ofs << 2*int(dataset.element(i).label)-1 << " ";
161  //libsvm file format documentation is scarce, but by convention the first class seems to be 1..
162  else ofs << dataset.element(i).label+1 << " ";
163  // write input data to file
164  for(std::size_t j=0; j<dim; j++) {
165  if(dense)
166  ofs << " " << j+1 << ":" <<dataset.element(i).input(j);
167  else if(dataset.element(i).input(j) != 0)
168  ofs << " " << j+1 << ":" << dataset.element(i).input(j);
169  }
170  ofs << std::endl;
171  }
172 }
173 
174 /** @}*/
175 
176 }
177 #endif