//===========================================================================
/*!
*
*
* \brief Support for importing and exporting data from and to character separated value (CSV) files
*
*
* \par
* The most important application of the methods provided in this
* file is the import of data from CSV files into Shark data
* containers.
*
*
*
*
* \author T. Voss, M. Tuma
* \date 2010
*
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Shark. If not, see .
*
*/
//===========================================================================
#ifndef SHARK_DATA_CSV_H
#define SHARK_DATA_CSV_H
#include
#include
#include
#include
namespace shark {
/**
* \ingroup shark_globals
*
* @{
*/
/// \brief Position of the label in a CSV file
///
/// \par
/// This type describes the position of the label in a record of a CSV file.
/// The label can be positioned either in the first or the last column, or
/// there can be no label present at all.
enum LabelPosition {
FIRST_COLUMN,
LAST_COLUMN,
};
namespace detail {
// export function for unlabeled data
template
void exportCSV(const T &data, // Container that holds the samples
Stream &out, // The file to be read from
char separator, // The separator between elements
bool scientific = true, //scientific notation?
unsigned int fieldwidth = 0
) {
SHARK_RUNTIME_CHECK(out, "Stream cannot be opened for writing.");
// set output format
if (scientific)
out.setf(std::ios_base::scientific);
std::streamsize ss = out.precision();
out.precision(10);
// write out
typename T::const_iterator it = data.begin();
for (; it != data.end(); ++it) {
SHARK_RUNTIME_CHECK(it->begin() != it->end(), "Record must not be empty");
for (std::size_t i=0; i<(*it).size()-1; i++) {
out << std::setw(fieldwidth) << (*it)(i) << separator;
}
out << std::setw(fieldwidth) << (*it)((*it).size()-1) << std::endl;
}
// restore output format
out.precision(ss);
}
// export function for labeled data
template
void exportCSV_labeled(const T &input, // Container that holds the samples
const U &labels, // Container that holds the labels
Stream &out, // The file to be read from
LabelPosition lp, // The position of the label
char separator, // The separator between elements
bool scientific = true, //scientific notation?
unsigned int fieldwidth = 0, //column-align using this field width
typename boost::enable_if<
std::is_arithmetic::type>
>::type* dummy = 0//enable this only for arithmetic types
) {
SHARK_RUNTIME_CHECK(out, "Stream cannot be opened for writing.");
if (scientific)
out.setf(std::ios_base::scientific);
std::streamsize ss = out.precision();
out.precision(10);
typename T::const_iterator iti = input.begin();
typename U::const_iterator itl = labels.begin();
for (; iti != input.end(); ++iti, ++itl) {
SHARK_RUNTIME_CHECK(iti->begin() != iti->end(), "Record must not be empty");
if (lp == FIRST_COLUMN)
out << *itl << separator;
for (std::size_t i=0; i<(*iti).size()-1; i++) {
out << std::setw(fieldwidth) << (*iti)(i) << separator;
}
if (lp == FIRST_COLUMN) {
out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
} else {
out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << separator << *itl << std::endl;
}
}
out.precision(ss);
}
// export function for data with vector labels
template
void exportCSV_labeled(
const T &input, // Container that holds the samples
const U &labels, // Container that holds the labels
Stream &out, // The file to be read from
LabelPosition lp, // The position of the label
char separator, // The separator between elements
bool scientific = true, //scientific notation?
unsigned int fieldwidth = 0, //column-align using this field width
typename boost::disable_if<
std::is_arithmetic::type>
>::type* dummy = 0//enable this only for complex types
) {
SHARK_RUNTIME_CHECK(out, "Stream cannot be opened for writing.");
if (scientific)
out.setf(std::ios_base::scientific);
std::streamsize ss = out.precision();
out.precision(10);
typename T::const_iterator iti = input.begin();
typename U::const_iterator itl = labels.begin();
for (; iti != input.end(); ++iti, ++itl) {
SHARK_RUNTIME_CHECK(iti->begin() != iti->end(), "[exportCSV (2)] record must not be empty");
if (lp == FIRST_COLUMN) {
for (std::size_t j = 0; j < itl->size(); j++)
out << std::setw(fieldwidth) << (*itl)(j) << separator;
}
for (std::size_t i=0; i<(*iti).size()-1; i++) {
out << std::setw(fieldwidth) << (*iti)(i) << separator;
}
if (lp == FIRST_COLUMN) {
out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
} else {
out << std::setw(fieldwidth) << (*iti)((*iti).size()-1);
for (std::size_t j = 0; j < itl->size(); j++)
out << std::setw(fieldwidth) << separator << (*itl)(j);
out << std::endl;
}
}
out.precision(ss);
}
} // namespace detail
// ACTUAL READ IN ROUTINES BELOW
/// \brief Import unlabeled vectors from a read-in character-separated value file.
///
/// \param data Container storing the loaded data
/// \param contents The read in csv-file
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Trailing character indicating comment line. By dfault it is '#'
/// \param maximumBatchSize Size of batches in the dataset
SHARK_EXPORT_SYMBOL void csvStringToData(
Data &data,
std::string const& contents,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = Data::DefaultBatchSize
);
/// \brief Import unlabeled vectors from a read-in character-separated value file.
///
/// \param data Container storing the loaded data
/// \param contents The read in csv-file
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Trailing character indicating comment line. By dfault it is '#'
/// \param maximumBatchSize Size of batches in the dataset
SHARK_EXPORT_SYMBOL void csvStringToData(
Data &data,
std::string const& contents,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = Data::DefaultBatchSize
);
/// \brief Import "csv" from string consisting only of a single unsigned int per row
///
/// \param data Container storing the loaded data
/// \param contents The read in csv-file
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Trailing characters indicating comment line. By default it is "#"
/// \param maximumBatchSize Size of batches in the dataset
SHARK_EXPORT_SYMBOL void csvStringToData(
Data &data,
std::string const& contents,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = Data::DefaultBatchSize
);
/// \brief Import "csv" from string consisting only of a single int per row
///
/// \param data Container storing the loaded data
/// \param contents The read in csv-file
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Trailing characters indicating comment line. By default it is "#"
/// \param maximumBatchSize Size of batches in the dataset
SHARK_EXPORT_SYMBOL void csvStringToData(
Data &data,
std::string const& contents,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = Data::DefaultBatchSize
);
/// \brief Import "csv" from string consisting only of a single double per row
///
/// \param data Container storing the loaded data
/// \param contents The read in csv-file
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Trailing characters indicating comment line. By default it is "#"
/// \param maximumBatchSize Size of batches in the dataset
SHARK_EXPORT_SYMBOL void csvStringToData(
Data &data,
std::string const& contents,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = Data::DefaultBatchSize
);
/// \brief Import "csv" from string consisting only of a single double per row
///
/// \param data Container storing the loaded data
/// \param contents The read in csv-file
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Trailing characters indicating comment line. By default it is "#"
/// \param maximumBatchSize Size of batches in the dataset
SHARK_EXPORT_SYMBOL void csvStringToData(
Data &data,
std::string const& contents,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = Data::DefaultBatchSize
);
/// \brief Import labeled data from a character-separated value file.
///
/// \param dataset Container storing the loaded data
/// \param contents the read-in file contents.
/// \param lp Position of the label in the record, either first or last column
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Character for indicating a comment, by default '#'
/// \param maximumBatchSize maximum size of a batch in the dataset after import
SHARK_EXPORT_SYMBOL void csvStringToData(
LabeledData &dataset,
std::string const& contents,
LabelPosition lp,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = LabeledData::DefaultBatchSize
);
/// \brief Import labeled data from a character-separated value file.
///
/// \param dataset Container storing the loaded data
/// \param contents the read-in file contents.
/// \param lp Position of the label in the record, either first or last column
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Character for indicating a comment, by default '#'
/// \param maximumBatchSize maximum size of a batch in the dataset after import
SHARK_EXPORT_SYMBOL void csvStringToData(
LabeledData &dataset,
std::string const& contents,
LabelPosition lp,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = LabeledData::DefaultBatchSize
);
/// \brief Import regression data from a read-in character-separated value file.
///
/// \param dataset Container storing the loaded data
/// \param contents The read in csv-file
/// \param lp Position of the label in the record, either first or last column
/// \param separator Separator between entries, typically a comma or a space
/// \param comment Character for indicating a comment, by default empty
/// \param numberOfOutputs Dimensionality of label/output
/// \param maximumBatchSize maximum size of a batch in the dataset after import
SHARK_EXPORT_SYMBOL void csvStringToData(
LabeledData &dataset,
std::string const& contents,
LabelPosition lp,
std::size_t numberOfOutputs = 1,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = LabeledData::DefaultBatchSize
);
/// \brief Import regression data from a read-in character-separated value file.
///
/// \param dataset Container storing the loaded data
/// \param contents The read in csv-file
/// \param lp Position of the label in the record, either first or last column
/// \param separator Separator between entries, typically a comma or a space
/// \param comment Character for indicating a comment, by default empty
/// \param numberOfOutputs Dimensionality of label/output
/// \param maximumBatchSize maximum size of a batch in the dataset after import
SHARK_EXPORT_SYMBOL void csvStringToData(
LabeledData &dataset,
std::string const& contents,
LabelPosition lp,
std::size_t numberOfOutputs = 1,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = LabeledData::DefaultBatchSize
);
/// \brief Import a Dataset from a csv file
///
/// \param data Container storing the loaded data
/// \param fn The file to be read from
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Trailing character indicating comment line. By dfault it is '#'
/// \param maximumBatchSize Size of batches in the dataset
/// \param titleLines Specifies a number of lines to be skipped in the beginning of the file
template
void importCSV(
Data& data,
std::string fn,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = Data::DefaultBatchSize,
std::size_t titleLines = 0
){
std::ifstream stream(fn.c_str());
SHARK_RUNTIME_CHECK(stream, "Stream cannot be opened for reading.");
stream.unsetf(std::ios::skipws);
for(std::size_t i=0; i < titleLines; ++i) // ignoring the first lines
stream.ignore(std::numeric_limits::max(), '\n');
std::istream_iterator streamBegin(stream);
std::string contents(//read contents of file in string
streamBegin,
std::istream_iterator()
);
//call the actual parser
csvStringToData(data,contents,separator,comment,maximumBatchSize);
}
/// \brief Import a labeled Dataset from a csv file
///
/// \param data Container storing the loaded data
/// \param fn The file to be read from
/// \param lp Position of the label in the record, either first or last column
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Trailing character indicating comment line. By dfault it is '#'
/// \param maximumBatchSize Size of batches in the dataset
template
void importCSV(
LabeledData, unsigned int>& data,
std::string fn,
LabelPosition lp,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = LabeledData::DefaultBatchSize
){
std::ifstream stream(fn.c_str());
SHARK_RUNTIME_CHECK(stream, "Stream cannot be opened for reading.");
stream.unsetf(std::ios::skipws);
std::istream_iterator streamBegin(stream);
std::string contents(//read contents of file in string
streamBegin,
std::istream_iterator()
);
//call the actual parser
csvStringToData(data,contents,lp,separator,comment,maximumBatchSize);
}
/// \brief Import a labeled Dataset from a csv file
///
/// \param data Container storing the loaded data
/// \param fn The file to be read from
/// \param lp Position of the label in the record, either first or last column
/// \param numberOfOutputs dimensionality of the labels
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
/// \param comment Trailing character indicating comment line. By dfault it is '#'
/// \param maximumBatchSize Size of batches in the dataset
template
void importCSV(
LabeledData, blas::vector >& data,
std::string fn,
LabelPosition lp,
std::size_t numberOfOutputs = 1,
char separator = ',',
char comment = '#',
std::size_t maximumBatchSize = LabeledData::DefaultBatchSize
){
std::ifstream stream(fn.c_str());
SHARK_RUNTIME_CHECK(stream, "Stream cannot be opened for reading.");
stream.unsetf(std::ios::skipws);
std::istream_iterator streamBegin(stream);
std::string contents(//read contents of file in string
streamBegin,
std::istream_iterator()
);
//call the actual parser
csvStringToData(data,contents,lp, numberOfOutputs, separator,comment,maximumBatchSize);
}
/// \brief Format unlabeled data into a character-separated value file.
///
/// \param set Container to be exported
/// \param fn The file to be written to
/// \param separator Separator between entries, typically a comma or a space
/// \param sci should the output be in scientific notation?
/// \param width argument to std::setw when writing the output
template
void exportCSV(
Data const& set,
std::string fn,
char separator = ',',
bool sci = true,
unsigned int width = 0
) {
std::ofstream ofs(fn.c_str());
SHARK_RUNTIME_CHECK(ofs, "Stream cannot be opened for writing.");
detail::exportCSV(set.elements(), ofs, separator, sci, width);
}
/// \brief Format labeled data into a character-separated value file.
///
/// \param dataset Container to be exported
/// \param fn The file to be written to
/// \param lp Position of the label in the record, either first or last column
/// \param separator Separator between entries, typically a comma or a space
/// \param sci should the output be in scientific notation?
/// \param width argument to std::setw when writing the output
template
void exportCSV(
LabeledData const &dataset,
std::string fn,
LabelPosition lp,
char separator = ',',
bool sci = true,
unsigned int width = 0
) {
std::ofstream ofs(fn.c_str());
SHARK_RUNTIME_CHECK(ofs, "Stream cannot be opened for writing.");
detail::exportCSV_labeled(dataset.inputs().elements(), dataset.labels().elements(), ofs, lp, separator, sci, width);
}
/** @}*/
} // namespace shark
#endif // SHARK_ML_CSV_H