//===========================================================================
/*!
*
*
* \brief Support for downloading data sets from online sources.
*
*
* \par
* The functions in this file allow downloading data sets from the
* mldata.org repository and other sources.
*
*
*
*
* \author T. Glasmachers
* \date 2016
*
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Shark. If not, see <http://www.gnu.org/licenses/>.
*
*/
//===========================================================================
#ifndef SHARK_DATA_DOWNLOAD_H
#define SHARK_DATA_DOWNLOAD_H
#include
#include
#include
#include
#include
namespace shark {
/**
* \ingroup shark_globals
*
* @{
*/
/// \brief Split a URL into its domain and resource parts.
///
/// \param  url  URL to split
/// \return std::pair whose first element is the domain and whose second
///         element is the resource
///
/// With std::tie the result can be unpacked directly:
/// \code
///     std::tie(domain, resource) = splitUrl(url);
/// \endcode
/// fills the std::string variables domain and resource.
//
// NOTE(review): the template arguments of std::pair are not visible in this
// declaration; the description above suggests two std::string - confirm.
SHARK_EXPORT_SYMBOL std::pair splitUrl(std::string const & url);
/// \brief Download a document with the HTTP protocol.
///
/// \param  url   download URL, for example "www.shark-ml.org/index.html"
/// \param  port  TCP/IP port, defaults to 80
/// \return body of the HTTP reply, which in case of success is the
///         requested document
///
/// The function requests the document with an HTTP request and returns
/// the body of the corresponding HTTP reply. In case of an error the
/// function throws an exception. All HTTP response status codes other
/// than 200 are reported as failure to download the document and trigger
/// an exception.
///
/// Note that the function does not perform the standard actions of web
/// browsers, e.g., it does not execute javascript and does not follow
/// HTTP redirects.
SHARK_EXPORT_SYMBOL std::string download(std::string const& url, unsigned short port = 80);
/// \brief Fetch a sparse data (libSVM) file over HTTP and import it.
///
/// The document is retrieved with download(), wrapped into a string stream
/// and handed to importSparseData().
///
/// \param  dataset       container receiving the loaded data
/// \param  url           http URL of the file
/// \param  port          TCP/IP port, 80 by default
/// \param  highestIndex  highest feature index, or 0 for auto-detection
/// \param  batchSize     size of a batch
//
// NOTE(review): the template parameter list and the template arguments of
// LabeledData are not visible in this definition - confirm against the
// original header.
template void downloadSparseData(
	LabeledData& dataset,
	std::string const& url,
	unsigned short port = 80,
	unsigned int highestIndex = 0,
	std::size_t batchSize = LabeledData::DefaultBatchSize
)
{
	// The importer is handed a stream, so the reply body is wrapped in one.
	std::stringstream stream(download(url, port));
	importSparseData(dataset, stream, highestIndex, batchSize);
}
/// \brief Fetch a data set from mldata.org by its name.
///
/// The data set name is turned into the last path component of the download
/// URL: blanks become dashes, the letters 'A' to 'Z' are converted to lower
/// case, and the characters '[', ']', '(', ')' and '.' are dropped. The
/// resulting libsvm file is then loaded with downloadSparseData() from port
/// 80 with automatic detection of the highest feature index.
///
/// \param  dataset    container receiving the loaded data
/// \param  name       data set name
/// \param  batchSize  size of a batch
//
// NOTE(review): the template parameter list and the template arguments of
// LabeledData are not visible in this definition - confirm against the
// original header.
template void downloadFromMLData(
	LabeledData& dataset,
	std::string const& name,
	std::size_t batchSize = LabeledData::DefaultBatchSize
)
{
	std::string slug;
	for (std::size_t i = 0; i != name.size(); ++i)
	{
		char const ch = name[i];
		switch (ch)
		{
		// brackets, parentheses and dots do not appear in the file name
		case '[':
		case ']':
		case '(':
		case ')':
		case '.':
			break;
		// blanks are replaced by dashes
		case ' ':
			slug += '-';
			break;
		default:
			// upper case letters are shifted to lower case by the fixed
			// offset 32; everything else is copied unchanged
			slug += (ch >= 'A' && ch <= 'Z') ? static_cast<char>(ch + 32) : ch;
			break;
		}
	}

	std::string const url = "mldata.org/repository/data/download/libsvm/" + slug + "/";
	downloadSparseData(dataset, url, 80, 0, batchSize);
}
/// \brief Fetch a dense data (CSV) file over HTTP and import it for classification.
///
/// The document is retrieved with download() and its text is passed on to
/// csvStringToData() together with the parsing options.
///
/// \param  dataset           container receiving the loaded data
/// \param  url               http URL of the file
/// \param  lp                position of the label in the record, either first or last column
/// \param  separator         optional separator between entries, typically a comma; spaces are automatically ignored
/// \param  comment           character indicating a comment line, '#' by default
/// \param  port              TCP/IP port, 80 by default
/// \param  maximumBatchSize  size of the batches in the dataset
//
// NOTE(review): the template parameter list and the template arguments of
// LabeledData are not visible in this definition - confirm against the
// original header.
template void downloadCsvData(
	LabeledData& dataset,
	std::string const& url,
	LabelPosition lp,
	char separator = ',',
	char comment = '#',
	unsigned short port = 80,
	std::size_t maximumBatchSize = LabeledData::DefaultBatchSize
)
{
	// raw CSV text as delivered by the server
	std::string csvText = download(url, port);
	csvStringToData(dataset, csvText, lp, separator, comment, maximumBatchSize);
}
/// \brief Fetch a dense data (CSV) file over HTTP and import it for regression.
///
/// The document is retrieved with download() and its text is passed on to
/// csvStringToData() together with the label dimension and the parsing
/// options.
///
/// \param  dataset           container receiving the loaded data
/// \param  url               http URL of the file
/// \param  lp                position of the label in the record, either first or last column
/// \param  numberOfOutputs   dimensionality of the labels, 1 by default
/// \param  separator         optional separator between entries, typically a comma; spaces are automatically ignored
/// \param  comment           character indicating a comment line, '#' by default
/// \param  port              TCP/IP port, 80 by default
/// \param  maximumBatchSize  size of the batches in the dataset
//
// NOTE(review): the template parameter list and the template arguments of
// LabeledData are not visible in this definition - confirm against the
// original header.
template void downloadCsvData(
	LabeledData& dataset,
	std::string const& url,
	LabelPosition lp,
	std::size_t numberOfOutputs = 1,
	char separator = ',',
	char comment = '#',
	unsigned short port = 80,
	std::size_t maximumBatchSize = LabeledData::DefaultBatchSize
)
{
	// raw CSV text as delivered by the server
	std::string csvText = download(url, port);
	csvStringToData(dataset, csvText, lp, numberOfOutputs, separator, comment, maximumBatchSize);
}
/** @}*/
}
#endif