Csv.h
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Support for importing and exporting data from and to character separated value (CSV) files
6  *
7  *
8  * \par
9  * The most important application of the methods provided in this
10  * file is the import of data from CSV files into Shark data
11  * containers.
12  *
13  *
14  *
15  *
16  * \author T. Voss, M. Tuma
17  * \date 2010
18  *
19  *
20  * \par Copyright 1995-2015 Shark Development Team
21  *
22  * <BR><HR>
23  * This file is part of Shark.
24  * <http://image.diku.dk/shark/>
25  *
26  * Shark is free software: you can redistribute it and/or modify
27  * it under the terms of the GNU Lesser General Public License as published
28  * by the Free Software Foundation, either version 3 of the License, or
29  * (at your option) any later version.
30  *
31  * Shark is distributed in the hope that it will be useful,
32  * but WITHOUT ANY WARRANTY; without even the implied warranty of
33  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34  * GNU Lesser General Public License for more details.
35  *
36  * You should have received a copy of the GNU Lesser General Public License
37  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
38  *
39  */
40 //===========================================================================
41 
42 #ifndef SHARK_DATA_CSV_H
43 #define SHARK_DATA_CSV_H
44 
45 #include <shark/Core/DLLSupport.h>
46 #include <shark/Data/Dataset.h>
47 
48 #include <boost/algorithm/string.hpp>
49 #include <boost/algorithm/string/trim.hpp>
50 #include <boost/format.hpp>
51 #include <boost/iostreams/filter/newline.hpp>
52 #include <boost/lexical_cast.hpp>
53 #include <boost/type_traits.hpp>
54 
55 #include <exception>
56 #include <fstream>
57 #include <map>
58 #include <string>
59 
60 namespace shark {
61 
62 /**
63  * \ingroup shark_globals
64  *
65  * @{
66  */
67 
68 
69 /// \brief Position of the label in a CSV file
70 ///
71 /// \par
72 /// This type describes the position of the label in a record of a CSV file.
73 /// The label can be positioned either in the first or the last column, or
74 /// there can be no label present at all.
78 };
79 
80 namespace detail {
81 
82  // export function for unlabeled data
83  template<typename T, typename Stream>
84  void exportCSV(const T &data, // Container that holds the samples
85  Stream &out, // The file to be read from
86  char separator, // The separator between elements
87  bool scientific = true, //scientific notation?
88  unsigned int fieldwidth = 0
89  ) {
90  if (!out) {
91  throw(std::invalid_argument("[exportCSV (1)] Stream cannot be opened for writing."));
92  }
93 
94  // set output format
95  if (scientific)
96  out.setf(std::ios_base::scientific);
97  std::streamsize ss = out.precision();
98  out.precision(10);
99 
100  // write out
101  typename T::const_iterator it = data.begin();
102  for (; it != data.end(); ++it) {
103  SHARK_CHECK(it->begin() != it->end(), "[exportCSV (1)] record must not be empty");
104  for (std::size_t i=0; i<(*it).size()-1; i++) {
105  out << std::setw(fieldwidth) << (*it)(i) << separator;
106  }
107  out << std::setw(fieldwidth) << (*it)((*it).size()-1) << std::endl;
108  }
109 
110  // restore output format
111  out.precision(ss);
112  }
113 
114  // export function for labeled data
115 
116  template<typename T, typename U, typename Stream>
117  void exportCSV_labeled(const T &input, // Container that holds the samples
118  const U &labels, // Container that holds the labels
119  Stream &out, // The file to be read from
120  LabelPosition lp, // The position of the label
121  char separator, // The separator between elements
122  bool scientific = true, //scientific notation?
123  unsigned int fieldwidth = 0, //column-align using this field width
124  typename boost::enable_if<
125  boost::is_arithmetic<typename boost::range_value<U>::type>
126  >::type* dummy = 0//enable this only for arithmetic types
127  ) {
128 
129  if (!out) {
130  throw(std::invalid_argument("[exportCSV (2)] Stream cannot be opened for writing."));
131  }
132 
133 
134  if (scientific)
135  out.setf(std::ios_base::scientific);
136  std::streamsize ss = out.precision();
137  out.precision(10);
138 
139  typename T::const_iterator iti = input.begin();
140  typename U::const_iterator itl = labels.begin();
141 
142 
143  for (; iti != input.end(); ++iti, ++itl) {
144  SHARK_CHECK(iti->begin() != iti->end(), "[exportCSV (2)] record must not be empty");
145  if (lp == FIRST_COLUMN)
146  out << *itl << separator;
147  for (std::size_t i=0; i<(*iti).size()-1; i++) {
148  out << std::setw(fieldwidth) << (*iti)(i) << separator;
149  }
150  if (lp == FIRST_COLUMN) {
151  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
152  } else {
153  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << separator << *itl << std::endl;
154  }
155  }
156  out.precision(ss);
157  }
158 
159  // export function for data with vector labels
160  template<typename T, typename U, typename Stream>
161  void exportCSV_labeled(
162  const T &input, // Container that holds the samples
163  const U &labels, // Container that holds the labels
164  Stream &out, // The file to be read from
165  LabelPosition lp, // The position of the label
166  char separator, // The separator between elements
167  bool scientific = true, //scientific notation?
168  unsigned int fieldwidth = 0, //column-align using this field width
169  typename boost::disable_if<
170  boost::is_arithmetic<typename boost::range_value<U>::type>
171  >::type* dummy = 0//enable this only for complex types
172  ) {
173 
174  if (!out) {
175  throw(std::invalid_argument("[exportCSV (2)] Stream cannot be opened for writing."));
176  }
177 
178 
179  if (scientific)
180  out.setf(std::ios_base::scientific);
181  std::streamsize ss = out.precision();
182  out.precision(10);
183 
184  typename T::const_iterator iti = input.begin();
185  typename U::const_iterator itl = labels.begin();
186 
187  for (; iti != input.end(); ++iti, ++itl) {
188  SHARK_CHECK(iti->begin() != iti->end(), "[exportCSV (2)] record must not be empty");
189  if (lp == FIRST_COLUMN) {
190  for (std::size_t j = 0; j < itl->size(); j++) out << std::setw(fieldwidth) << (*itl)(j) << separator;
191  }
192  for (std::size_t i=0; i<(*iti).size()-1; i++) {
193  out << std::setw(fieldwidth) << (*iti)(i) << separator;
194  }
195  if (lp == FIRST_COLUMN) {
196  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
197  } else {
198  out << std::setw(fieldwidth) << (*iti)((*iti).size()-1);
199  for (std::size_t j = 0; j < itl->size(); j++) out << std::setw(fieldwidth) << separator << (*itl)(j);
200  out << std::endl;
201  }
202  }
203  out.precision(ss);
204  }
205 } // namespace detail
206 
207 
208 
209 // ACTUAL READ IN ROUTINES BELOW
210 
211 /// \brief Import unlabeled vectors from a read-in character-separated value file.
212 ///
213 /// \param data Container storing the loaded data
214 /// \param contents The read in csv-file
215 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
216 /// \param comment Trailing character indicating comment line. By default it is '#'
217 /// \param maximumBatchSize Size of batches in the dataset
219  Data<FloatVector> &data,
220  std::string const& contents,
221  char separator = ',',
222  char comment = '#',
223  std::size_t maximumBatchSize = Data<RealVector>::DefaultBatchSize
224 );
225 
226 /// \brief Import unlabeled vectors from a read-in character-separated value file.
227 ///
228 /// \param data Container storing the loaded data
229 /// \param contents The read in csv-file
230 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
231 /// \param comment Trailing character indicating comment line. By default it is '#'
232 /// \param maximumBatchSize Size of batches in the dataset
234  Data<RealVector> &data,
235  std::string const& contents,
236  char separator = ',',
237  char comment = '#',
238  std::size_t maximumBatchSize = Data<RealVector>::DefaultBatchSize
239 );
240 
241 /// \brief Import "csv" from string consisting only of a single unsigned int per row
242 ///
243 /// \param data Container storing the loaded data
244 /// \param contents The read in csv-file
245 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
246 /// \param comment Trailing characters indicating comment line. By default it is "#"
247 /// \param maximumBatchSize Size of batches in the dataset
249  Data<unsigned int> &data,
250  std::string const& contents,
251  char separator = ',',
252  char comment = '#',
253  std::size_t maximumBatchSize = Data<unsigned int>::DefaultBatchSize
254 );
255 
256 /// \brief Import "csv" from string consisting only of a single int per row
257 ///
258 /// \param data Container storing the loaded data
259 /// \param contents The read in csv-file
260 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
261 /// \param comment Trailing characters indicating comment line. By default it is "#"
262 /// \param maximumBatchSize Size of batches in the dataset
264  Data<int> &data,
265  std::string const& contents,
266  char separator = ',',
267  char comment = '#',
268  std::size_t maximumBatchSize = Data<int>::DefaultBatchSize
269 );
270 
271 /// \brief Import "csv" from string consisting only of a single float per row
272 ///
273 /// \param data Container storing the loaded data
274 /// \param contents The read in csv-file
275 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
276 /// \param comment Trailing characters indicating comment line. By default it is "#"
277 /// \param maximumBatchSize Size of batches in the dataset
279  Data<float> &data,
280  std::string const& contents,
281  char separator = ',',
282  char comment = '#',
283  std::size_t maximumBatchSize = Data<double>::DefaultBatchSize
284 );
285 
286 /// \brief Import "csv" from string consisting only of a single double per row
287 ///
288 /// \param data Container storing the loaded data
289 /// \param contents The read in csv-file
290 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
291 /// \param comment Trailing characters indicating comment line. By default it is "#"
292 /// \param maximumBatchSize Size of batches in the dataset
294  Data<double> &data,
295  std::string const& contents,
296  char separator = ',',
297  char comment = '#',
298  std::size_t maximumBatchSize = Data<double>::DefaultBatchSize
299 );
300 
301 /// \brief Import labeled data from a character-separated value file.
302 ///
303 /// \param dataset Container storing the loaded data
304 /// \param contents the read-in file contents.
305 /// \param lp Position of the label in the record, either first or last column
306 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
307 /// \param comment Character for indicating a comment, by default '#'
308 /// \param maximumBatchSize maximum size of a batch in the dataset after import
311  std::string const& contents,
312  LabelPosition lp,
313  char separator = ',',
314  char comment = '#',
315  std::size_t maximumBatchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
316 );
317 
318 /// \brief Import labeled data from a character-separated value file.
319 ///
320 /// \param dataset Container storing the loaded data
321 /// \param contents the read-in file contents.
322 /// \param lp Position of the label in the record, either first or last column
323 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
324 /// \param comment Character for indicating a comment, by default '#'
325 /// \param maximumBatchSize maximum size of a batch in the dataset after import
328  std::string const& contents,
329  LabelPosition lp,
330  char separator = ',',
331  char comment = '#',
332  std::size_t maximumBatchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
333 );
334 
335 
336 /// \brief Import regression data from a read-in character-separated value file.
337 ///
338 /// \param dataset Container storing the loaded data
339 /// \param contents The read in csv-file
340 /// \param lp Position of the label in the record, either first or last column
341 /// \param separator Separator between entries, typically a comma or a space
342 /// \param comment Character for indicating a comment, by default '#'
343 /// \param numberOfOutputs Dimensionality of label/output
344 /// \param maximumBatchSize maximum size of a batch in the dataset after import
347  std::string const& contents,
348  LabelPosition lp,
349  std::size_t numberOfOutputs = 1,
350  char separator = ',',
351  char comment = '#',
352  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
353 );
354 
355 /// \brief Import regression data from a read-in character-separated value file.
356 ///
357 /// \param dataset Container storing the loaded data
358 /// \param contents The read in csv-file
359 /// \param lp Position of the label in the record, either first or last column
360 /// \param separator Separator between entries, typically a comma or a space
361 /// \param comment Character for indicating a comment, by default '#'
362 /// \param numberOfOutputs Dimensionality of label/output
363 /// \param maximumBatchSize maximum size of a batch in the dataset after import
366  std::string const& contents,
367  LabelPosition lp,
368  std::size_t numberOfOutputs = 1,
369  char separator = ',',
370  char comment = '#',
371  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
372 );
373 
374 
375 
376 /// \brief Import a Dataset from a csv file
377 ///
378 /// \param data Container storing the loaded data
379 /// \param fn The file to be read from
380 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
381 /// \param comment Trailing character indicating comment line. By default it is '#'
382 /// \param maximumBatchSize Size of batches in the dataset
383 /// \param titleLines Specifies a number of lines to be skipped in the beginning of the file
384 template<class T>
386  Data<T>& data,
387  std::string fn,
388  char separator = ',',
389  char comment = '#',
390  std::size_t maximumBatchSize = Data<T>::DefaultBatchSize,
391  std::size_t titleLines = 0
392 ){
393  std::ifstream stream(fn.c_str());
394  stream.unsetf(std::ios::skipws);
395 
396  for(std::size_t i=0; i < titleLines; ++i) // ignoring the first lines
397  stream.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
398 
399  std::istream_iterator<char> streamBegin(stream);
400  std::string contents(//read contents of file in string
401  streamBegin,
402  std::istream_iterator<char>()
403  );
404  //call the actual parser
405  csvStringToData(data,contents,separator,comment,maximumBatchSize);
406 }
407 
408 /// \brief Import a labeled Dataset from a csv file
409 ///
410 /// \param data Container storing the loaded data
411 /// \param fn The file to be read from
412 /// \param lp Position of the label in the record, either first or last column
413 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
414 /// \param comment Trailing character indicating comment line. By default it is '#'
415 /// \param maximumBatchSize Size of batches in the dataset
416 template<class T>
418  LabeledData<blas::vector<T>, unsigned int>& data,
419  std::string fn,
420  LabelPosition lp,
421  char separator = ',',
422  char comment = '#',
423  std::size_t maximumBatchSize = LabeledData<RealVector, unsigned int>::DefaultBatchSize
424 ){
425  std::ifstream stream(fn.c_str());
426  stream.unsetf(std::ios::skipws);
427  std::istream_iterator<char> streamBegin(stream);
428  std::string contents(//read contents of file in string
429  streamBegin,
430  std::istream_iterator<char>()
431  );
432  //call the actual parser
433  csvStringToData(data,contents,lp,separator,comment,maximumBatchSize);
434 }
435 
436 /// \brief Import a labeled Dataset from a csv file
437 ///
438 /// \param data Container storing the loaded data
439 /// \param fn The file to be read from
440 /// \param lp Position of the label in the record, either first or last column
441 /// \param numberOfOutputs dimensionality of the labels
442 /// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
443 /// \param comment Trailing character indicating comment line. By default it is '#'
444 /// \param maximumBatchSize Size of batches in the dataset
445 template<class T>
448  std::string fn,
449  LabelPosition lp,
450  std::size_t numberOfOutputs = 1,
451  char separator = ',',
452  char comment = '#',
453  std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
454 ){
455  std::ifstream stream(fn.c_str());
456  stream.unsetf(std::ios::skipws);
457  std::istream_iterator<char> streamBegin(stream);
458  std::string contents(//read contents of file in string
459  streamBegin,
460  std::istream_iterator<char>()
461  );
462  //call the actual parser
463  csvStringToData(data,contents,lp, numberOfOutputs, separator,comment,maximumBatchSize);
464 }
465 
466 /// \brief Format unlabeled data into a character-separated value file.
467 ///
468 /// \param set Container to be exported
469 /// \param fn The file to be written to
470 /// \param separator Separator between entries, typically a comma or a space
471 /// \param sci should the output be in scientific notation?
472 /// \param width argument to std::setw when writing the output
473 template<typename Type>
475  Data<Type> const& set,
476  std::string fn,
477  char separator = ',',
478  bool sci = true,
479  unsigned int width = 0
480 ) {
481  std::ofstream ofs(fn.c_str());
482  detail::exportCSV(set.elements(), ofs, separator, sci, width);
483 }
484 
485 
486 /// \brief Format labeled data into a character-separated value file.
487 ///
488 /// \param dataset Container to be exported
489 /// \param fn The file to be written to
490 /// \param lp Position of the label in the record, either first or last column
491 /// \param separator Separator between entries, typically a comma or a space
492 /// \param sci should the output be in scientific notation?
493 /// \param width argument to std::setw when writing the output
494 template<typename InputType, typename LabelType>
496  LabeledData<InputType, LabelType> const &dataset,
497  std::string fn,
498  LabelPosition lp,
499  char separator = ',',
500  bool sci = true,
501  unsigned int width = 0
502 ) {
503  std::ofstream ofs(fn.c_str());
504  detail::exportCSV_labeled(dataset.inputs().elements(), dataset.labels().elements(), ofs, lp, separator, sci, width);
505 }
506 
507 
508 /** @}*/
509 
510 } // namespace shark
511 #endif // SHARK_DATA_CSV_H