//===========================================================================
/*!
*
*
* \brief Data for (un-)supervised learning.
*
*
* \par
* This file provides containers for data used by the models, loss
* functions, and learning algorithms (trainers). The reason for
* dedicated containers of this type is that data often need to be
* split into subsets, such as training and test data, or folds in
* cross-validation. The containers in this file provide memory
* efficient mechanisms for managing and providing such subsets.
*
*
*
*
* \author O. Krause, T. Glasmachers
* \date 2010-2014
*
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
*
*/
//===========================================================================
#ifndef SHARK_DATA_DATASET_H
#define SHARK_DATA_DATASET_H
#include <boost/range/iterator_range.hpp>
#include <shark/Core/Exception.h>
#include <shark/Core/OpenMP.h>
#include <shark/Core/utility/functional.h>
#include <shark/Data/BatchInterface.h>
#include <shark/Core/Random.h>
#include <shark/Core/Shape.h>
#include "Impl/Dataset.inl"
namespace shark {
///
/// \brief Data container.
///
/// The Data class is Shark's container for machine learning data.
/// This container (and its sub-classes) is used for input data,
/// labels, and model outputs.
///
/// \par
/// The Data container organizes the data it holds in batches:
/// it tries to find a good joint representation for a whole set of,
/// for example, 100 data points at a time. If the stored type is, for example,
/// RealVector, the batches are RealMatrices. This pays off because operations on a
/// whole matrix are usually faster than operations on the separate vectors.
/// Nearly all operations of the container are therefore expressed in terms of batches,
/// and the iterator interface gives access to the batches but not to single elements.
/// For element-wise access, separate element iterators and const element iterators are provided via elements().
///\par
///When you need to explicitly iterate over all elements, you can use:
///\code
/// Data<RealVector> data;
/// for(auto pos = data.elements().begin(); pos != data.elements().end(); ++pos){
///     std::cout<<*pos<<" ";
///     auto ref=*pos;
///     ref*=2;      // writes through to the underlying batch
///     std::cout<<*pos<<std::endl;
/// }
///\endcode
///\par
/// The batch type is deduced from the element type: for arithmetic types such as double or int,
/// the batch is the corresponding vector type (i.e. double->RealVector or Int->IntVector).
/// For vectors the batches are matrices as mentioned above. If the vector is sparse, so is the matrix.
/// And for everything else the batch type is just a std::vector of the type, so no optimization can be applied.
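///\par
/// As an illustration of batch-wise access, here is a minimal sketch (it assumes a Data<RealVector> object named data):
///\code
/// for(auto const& batch: data.batches()){
///     // each batch of a Data<RealVector> is a RealMatrix; rows are individual points
///     std::cout << batch.size1() << " points of dimension " << batch.size2() << "\n";
/// }
///\endcode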
/// \par
/// When constructing the container, the batch size can be set. If it is not set by the user, the default batch size is chosen. A batch size of 0
/// corresponds to putting all data into a single batch. Beware that not only the data needs storage but also
/// the various models during computation. So the actual amount of memory needed to process a batch can greatly exceed the batch size.
///
/// An additional feature of the Data class is that it can be used to create lazy subsets. So the batches of a dataset
/// can be shared between various instances of the data class without additional memory overhead.
///
///
///\warning Be aware (especially for derived containers like LabeledData) that the set does not enforce structural consistency.
/// When you change the structure of the data part, for example by directly changing the size of the batches, the size of the labels is not
/// changed accordingly. Also, when creating subsets of a set, changing the parent will change its siblings and vice versa. The programmer
/// needs to ensure structural integrity!
/// For example this is dangerous:
/// \code
/// void function(Data<unsigned int>& data){
///     Data<unsigned int> newData(...);
///     data = newData;
/// }
/// \endcode
/// If data was originally part of a LabeledData object and newData has a different batch structure than data, this leads to structural inconsistencies.
/// If function is rewritten such that newData has the same structure as data, this code is perfectly fine. The best way to get around this problem is
/// to rewrite the code as:
/// \code
/// Data<unsigned int> function(){
///     Data<unsigned int> newData(...);
///     return newData;
/// }
/// \endcode
///\todo expand docu
template <class Type>
class Data : public ISerializable
{
protected:
typedef detail::SharedContainer<Type> Container;
Container m_data;///< data
Shape m_shape;///< shape of a datapoint
public:
/// \brief Defines the default batch size of the Container.
///
/// Zero means: unlimited
BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = 256);
typedef typename Container::BatchType batch_type;
typedef batch_type& batch_reference;
typedef batch_type const& const_batch_reference;
typedef Type element_type;
typedef typename Batch<Type>::reference element_reference;
typedef typename Batch<Type>::const_reference const_element_reference;
typedef std::vector<std::size_t> IndexSet;
/// \brief Two containers compare equal if they share the same data.
template <class T> bool operator == (const Data<T>& rhs) {
return (m_data == rhs.m_data);
}
/// \brief Two containers compare unequal if they don't share the same data.
template <class T> bool operator != (const Data<T>& rhs) {
return (! (*this == rhs));
}
template <class InputT, class LabelT> friend class LabeledData;
// RANGES
typedef boost::iterator_range< detail::DataElementIterator<Data<Type> > > element_range;
typedef boost::iterator_range< detail::DataElementIterator<Data<Type> const> > const_element_range;
typedef detail::BatchRange<Data<Type> > batch_range;
typedef detail::BatchRange<Data<Type> const> const_batch_range;
///\brief Returns the range of elements.
///
///It is compatible to boost::range and STL and can be used whenever an algorithm requires
///element access via begin()/end() in which case data.elements() provides the correct interface
const_element_range elements()const{
return const_element_range(
detail::DataElementIterator<Data<Type> const>(this,0,0,0),
detail::DataElementIterator<Data<Type> const>(this,numberOfBatches(),0,numberOfElements())
);
}
///\brief Returns the range of elements.
///
///It is compatible to boost::range and STL and can be used whenever an algorithm requires
///element access via begin()/end() in which case data.elements() provides the correct interface
element_range elements(){
return element_range(
detail::DataElementIterator<Data<Type> >(this,0,0,0),
detail::DataElementIterator<Data<Type> >(this,numberOfBatches(),0,numberOfElements())
);
}
///\brief Returns the range of batches.
///
///It is compatible to boost::range and STL and can be used whenever an algorithm requires
///access to the batches via begin()/end(), in which case data.batches() provides the correct interface
const_batch_range batches()const{
return const_batch_range(this);
}
///\brief Returns the range of batches.
///
///It is compatible to boost::range and STL and can be used whenever an algorithm requires
///access to the batches via begin()/end(), in which case data.batches() provides the correct interface
batch_range batches(){
return batch_range(this);
}
///\brief Returns the number of batches of the set.
std::size_t numberOfBatches() const{
return m_data.size();
}
///\brief Returns the total number of elements.
std::size_t numberOfElements() const{
return m_data.numberOfElements();
}
///\brief Returns the shape of the elements in the dataset.
Shape const& shape() const{
return m_shape;
}
///\brief Returns the shape of the elements in the dataset.
Shape& shape(){
return m_shape;
}
///\brief Check whether the set is empty.
bool empty() const{
return m_data.empty();
}
// ELEMENT ACCESS
element_reference element(std::size_t i){
return *(detail::DataElementIterator<Data<Type> >(this,0,0,0)+i);
}
const_element_reference element(std::size_t i) const{
return *(detail::DataElementIterator<Data<Type> const>(this,0,0,0)+i);
}
// BATCH ACCESS
batch_reference batch(std::size_t i){
return *(m_data.begin()+i);
}
const_batch_reference batch(std::size_t i) const{
return *(m_data.begin()+i);
}
// CONSTRUCTORS
///\brief Constructor which constructs an empty set
Data(){ }
///\brief Construct a dataset with empty batches.
explicit Data(std::size_t numBatches) : m_data( numBatches )
{ }
///\brief Construction with size and a single element
///
/// Optionally the desired batch size can be set
///
///@param size the new size of the container
///@param element the blueprint element from which to create the Container
///@param batchSize the size of the batches. If this is 0, the size is unlimited
explicit Data(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
: m_data(size,element,batchSize)
{ }
// MISC
void read(InArchive& archive){
archive >> m_data;
archive >> m_shape;
}
void write(OutArchive& archive) const{
archive << m_data;
archive << m_shape;
}
///\brief This method makes the container independent of all siblings and parents.
virtual void makeIndependent(){
m_data.makeIndependent();
}
// METHODS TO ALTER BATCH STRUCTURE
void splitBatch(std::size_t batch, std::size_t elementIndex){
m_data.splitBatch(m_data.begin()+batch,elementIndex);
}
///\brief Splits the container into two independent parts. The front part remains in the container, the back part is returned.
///
///The order of elements remains unchanged. The SharedVector is not allowed to be shared for
///this to work.
Data splice(std::size_t batch){
Data right;
right.m_data = m_data.splice(m_data.begin()+batch);
right.m_shape = m_shape;
return right;
}
/// \brief Appends the contents of another data object to the end
///
/// The batches are not copied but now referenced from both datasets. Thus changing the appended
/// dataset might change this one as well.
void append(Data const& other){
m_data.append(other.m_data);
}
void push_back(const_batch_reference batch){
m_data.push_back(batch);
}
///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
///
///After the operation the container will contain batchSizes.size() batches, with the i-th batch having size batchSizes[i].
///The sum of all batch sizes must be equal to the current number of elements.
template<class Range>
void repartition(Range const& batchSizes){
m_data.repartition(batchSizes);
}
/// \brief Creates a vector with the batch sizes of every batch.
///
/// This method can be used together with repartition to ensure
/// that two datasets have the same batch structure.
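///
/// A minimal usage sketch (assuming two Data<RealVector> objects data1 and data2 holding the same number of elements):
/// \code
/// data2.repartition(data1.getPartitioning()); // data2 now has the same batch structure as data1
/// \endcode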
std::vector<std::size_t> getPartitioning()const{
return m_data.getPartitioning();
}
/// \brief Reorders elements across batches
///
/// Takes a vector of indices so that the i-th element of the reordered dataset is the element with index indices[i] in the original dataset.
/// This creates a temporary copy of the dataset and thus requires twice the memory of the original dataset
/// during construction.
template<class Range>
void reorderElements(Range const& indices){
Data dataCopy(numberOfBatches());
dataCopy.shape() = shape();
std::vector<Type> batch_elements;
auto indexPos = indices.begin();
auto elemBegin = elements().begin();
for(std::size_t b = 0; b != numberOfBatches(); ++b){
std::size_t numElements = batchSize(batch(b));
batch_elements.clear();
for(std::size_t i = 0; i != numElements; ++i,++indexPos){
batch_elements.push_back(*(elemBegin+*indexPos));
}
dataCopy.batch(b) = createBatch(batch_elements);
}
*this = dataCopy;
}
// SUBSETS
///\brief Fill in the subset defined by the list of indices as well as its complement.
void indexedSubset(IndexSet const& indices, Data& subset, Data& complement) const{
IndexSet comp;
detail::complement(indices,m_data.size(),comp);
subset.m_data=Container(m_data,indices);
complement.m_data=Container(m_data,comp);
}
Data indexedSubset(IndexSet const& indices) const{
Data subset;
subset.m_data = Container(m_data,indices);
subset.m_shape = m_shape;
return subset;
}
friend void swap(Data& a, Data& b){
swap(a.m_data,b.m_data);
std::swap(a.m_shape,b.m_shape);
}
};
/**
* \ingroup shark_globals
* @{
*/
/// Outstream of elements.
template<class T>
std::ostream &operator << (std::ostream &stream, const Data<T>& d) {
for(auto elem:d.elements())
stream << elem << "\n";
return stream;
}
/** @} */
/// \brief Data set for unsupervised learning.
///
/// The UnlabeledData class is basically a standard Data container
/// with the special interpretation of its data points being
/// "inputs" to a learning algorithm.
template <class InputT>
class UnlabeledData : public Data<InputT>
{
public:
typedef InputT element_type;
typedef Data<InputT> base_type;
typedef element_type InputType;
typedef detail::SharedContainer<InputT> InputContainer;
protected:
using base_type::m_data;
public:
///\brief Constructor.
UnlabeledData()
{ }
///\brief Construction from data.
UnlabeledData(Data<InputT> const& points)
: base_type(points)
{ }
///\brief Construction with size and a single element
///
/// Optionally the desired batch size can be set
///
///@param size the new size of the container
///@param element the blueprint element from which to create the Container
///@param batchSize the size of the batches. If this is 0, the size is unlimited
UnlabeledData(std::size_t size, element_type const& element, std::size_t batchSize = base_type::DefaultBatchSize)
: base_type(size,element,batchSize)
{ }
///\brief Create an empty set with just the correct number of batches.
///
/// The user must initialize the dataset afterwards.
UnlabeledData(std::size_t numBatches)
: base_type(numBatches)
{ }
///\brief Construct a dataset with different batch sizes. It is a copy of the other dataset.
UnlabeledData(UnlabeledData const& container, std::vector<std::size_t> batchSizes)
:base_type(container,batchSizes){}
/// \brief We allow assignment from Data.
UnlabeledData operator=(Data<InputT> const& data){
static_cast<Data<InputT>& >(*this) = data;
return *this;
}
///\brief Access to the base_type class as "inputs".
///
/// Added for consistency with the LabeledData::labels() method.
UnlabeledData& inputs(){
return *this;
}
///\brief Access to the base_type class as "inputs".
///
/// Added for consistency with the LabeledData::labels() method.
UnlabeledData const& inputs() const{
return *this;
}
///\brief Splits the container into two independent parts. The left part remains in the container, the right part is returned.
///
///The order of elements remains unchanged. The SharedVector is not allowed to be shared for
///this to work.
UnlabeledData splice(std::size_t batch){
UnlabeledData right;
right.m_data = m_data.splice(m_data.begin()+batch);
right.m_shape = this->m_shape;
return right;
}
///\brief shuffles all elements in the entire dataset (that is, also across the batches)
void shuffle(){
std::vector<std::size_t> indices(this->numberOfElements());
std::iota(indices.begin(),indices.end(),0);
std::shuffle(indices.begin(),indices.end(), random::globalRng);
this->reorderElements(indices);
}
};
///
/// \brief Data set for supervised learning.
///
/// The LabeledData class extends UnlabeledData for the
/// representation of inputs. In addition it holds and
/// provides access to the corresponding labels.
///
/// LabeledData presents the underlying data as pairs of input and label data.
/// This means that when accessing a batch by calling batch(i), or when using one of the iterators,
/// one accesses the input batch via batch(i).input and the labels via batch(i).label.
///
/// This also holds true for single element access using element(i). Be aware that direct access to an element is
/// a linear time operation. So it is not advisable to iterate over the elements; instead iterate over the batches.
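///\par
/// A minimal sketch of batch-wise access (assuming a LabeledData<RealVector, unsigned int> object named data):
///\code
/// for(auto const& batch: data.batches()){
///     // batch.input is a batch of inputs (a RealMatrix), batch.label the corresponding batch of labels
///     std::cout << batchSize(batch.input) << " elements in this batch\n";
/// }
///\endcode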
template <class InputT, class LabelT>
class LabeledData : public ISerializable
{
public:
typedef InputT InputType;
typedef LabelT LabelType;
typedef UnlabeledData<InputT> InputContainer;
typedef Data<LabelT> LabelContainer;
typedef typename InputContainer::IndexSet IndexSet;
static const std::size_t DefaultBatchSize = InputContainer::DefaultBatchSize;
// TYPEDEFS FOR PAIRS
typedef InputLabelBatch<
typename Batch<InputType>::type,
typename Batch<LabelType>::type
> batch_type;
typedef InputLabelPair<InputType,LabelType> element_type;
// TYPEDEFS FOR REFERENCES
typedef InputLabelBatch<
typename Batch<InputType>::type&,
typename Batch<LabelType>::type&
> batch_reference;
typedef InputLabelBatch<
typename Batch<InputType>::type const&,
typename Batch<LabelType>::type const&
> const_batch_reference;
typedef typename batch_reference::reference element_reference;
typedef typename const_batch_reference::const_reference const_element_reference;
typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType> > > element_range;
typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType> const> > const_element_range;
typedef detail::BatchRange<LabeledData<InputType,LabelType> > batch_range;
typedef detail::BatchRange<LabeledData<InputType,LabelType> const> const_batch_range;
///\brief Returns the range of elements.
///
///It is compatible to boost::range and STL and can be used whenever an algorithm requires
///element access via begin()/end() in which case data.elements() provides the correct interface
const_element_range elements()const{
return const_element_range(
detail::DataElementIterator<LabeledData<InputType,LabelType> const>(this,0,0,0),
detail::DataElementIterator<LabeledData<InputType,LabelType> const>(this,numberOfBatches(),0,numberOfElements())
);
}
///\brief Returns the range of elements.
///
///It is compatible to boost::range and STL and can be used whenever an algorithm requires
///element access via begin()/end() in which case data.elements() provides the correct interface
element_range elements(){
return element_range(
detail::DataElementIterator<LabeledData<InputType,LabelType> >(this,0,0,0),
detail::DataElementIterator<LabeledData<InputType,LabelType> >(this,numberOfBatches(),0,numberOfElements())
);
}
///\brief Returns the range of batches.
///
///It is compatible to boost::range and STL and can be used whenever an algorithm requires
///access to the batches via begin()/end(), in which case data.batches() provides the correct interface
const_batch_range batches()const{
return const_batch_range(this);
}
///\brief Returns the range of batches.
///
///It is compatible to boost::range and STL and can be used whenever an algorithm requires
///access to the batches via begin()/end(), in which case data.batches() provides the correct interface
batch_range batches(){
return batch_range(this);
}
///\brief Returns the number of batches of the set.
std::size_t numberOfBatches() const{
return m_data.numberOfBatches();
}
///\brief Returns the total number of elements.
std::size_t numberOfElements() const{
return m_data.numberOfElements();
}
///\brief Check whether the set is empty.
bool empty() const{
return m_data.empty();
}
///\brief Access to inputs as a separate container.
InputContainer const& inputs() const{
return m_data;
}
///\brief Access to inputs as a separate container.
InputContainer& inputs(){
return m_data;
}
///\brief Access to labels as a separate container.
LabelContainer const& labels() const{
return m_label;
}
///\brief Access to labels as a separate container.
LabelContainer& labels(){
return m_label;
}
// CONSTRUCTORS
///\brief Empty data set.
LabeledData()
{}
///\brief Create an empty set with just the correct number of batches.
///
/// The user must initialize the dataset afterwards.
LabeledData(std::size_t numBatches)
: m_data(numBatches),m_label(numBatches)
{}
///\brief Construction with size and a single element
///
/// Optionally the desired batch size can be set
///
///@param size the new size of the container
///@param element the blueprint element from which to create the Container
///@param batchSize the size of the batches. If this is 0, the size is unlimited
LabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
: m_data(size,element.input,batchSize),
m_label(size,element.label,batchSize)
{}
///\brief Construction from data.
///
/// Beware that when calling this constructor the organization of batches must be equal in both
/// containers. This Constructor will not split the data!
LabeledData(Data<InputType> const& inputs, Data<LabelType> const& labels)
: m_data(inputs), m_label(labels)
{
SHARK_RUNTIME_CHECK(inputs.numberOfElements() == labels.numberOfElements(), "number of inputs and number of labels must agree");
#ifndef DNDEBUG
for(std::size_t i = 0; i != inputs.numberOfBatches(); ++i){
SIZE_CHECK(Batch<InputType>::size(inputs.batch(i))==Batch<LabelType>::size(labels.batch(i)));
}
#endif
}
// ELEMENT ACCESS
element_reference element(std::size_t i){
return *(detail::DataElementIterator<LabeledData<InputType,LabelType> >(this,0,0,0)+i);
}
const_element_reference element(std::size_t i) const{
return *(detail::DataElementIterator<LabeledData<InputType,LabelType> const>(this,0,0,0)+i);
}
// BATCH ACCESS
batch_reference batch(std::size_t i){
return batch_reference(m_data.batch(i),m_label.batch(i));
}
const_batch_reference batch(std::size_t i) const{
return const_batch_reference(m_data.batch(i),m_label.batch(i));
}
///\brief Returns the Shape of the inputs.
Shape const& inputShape() const{
return m_data.shape();
}
///\brief Returns the Shape of the inputs.
Shape& inputShape(){
return m_data.shape();
}
///\brief Returns the Shape of the labels.
Shape const& labelShape() const{
return m_label.shape();
}
///\brief Returns the Shape of the labels.
Shape& labelShape(){
return m_label.shape();
}
// MISC
/// from ISerializable
void read(InArchive& archive){
archive & m_data;
archive & m_label;
}
/// from ISerializable
void write(OutArchive& archive) const{
archive & m_data;
archive & m_label;
}
///\brief This method makes the container independent of all siblings and parents.
virtual void makeIndependent(){
m_label.makeIndependent();
m_data.makeIndependent();
}
void splitBatch(std::size_t batch, std::size_t elementIndex){
m_data.splitBatch(batch,elementIndex);
m_label.splitBatch(batch,elementIndex);
}
///\brief Splits the container into two independent parts. The left part remains in the container, the right part is returned.
///
///The order of elements remains unchanged. The SharedVector is not allowed to be shared for
///this to work.
LabeledData splice(std::size_t batch){
return LabeledData(m_data.splice(batch),m_label.splice(batch));
}
/// \brief Appends the contents of another data object to the end
///
/// The batches are not copied but now referenced from both datasets. Thus changing the appended
/// dataset might change this one as well.
void append(LabeledData const& other){
m_data.append(other.m_data);
m_label.append(other.m_label);
}
void push_back(
typename Batch<InputType>::type const& inputs,
typename Batch<LabelType>::type const& labels
){
m_data.push_back(inputs);
m_label.push_back(labels);
}
void push_back(
const_batch_reference batch
){
push_back(batch.input,batch.label);
}
///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
///
///After the operation the container will contain batchSizes.size() batches, with the i-th batch having size batchSizes[i].
///The sum of all batch sizes must be equal to the current number of elements.
template<class Range>
void repartition(Range const& batchSizes){
m_data.repartition(batchSizes);
m_label.repartition(batchSizes);
}
/// \brief Creates a vector with the batch sizes of every batch.
///
/// This method can be used together with repartition to ensure
/// that two datasets have the same batch structure.
std::vector<std::size_t> getPartitioning()const{
return m_data.getPartitioning();
}
friend void swap(LabeledData& a, LabeledData& b){
swap(a.m_data,b.m_data);
swap(a.m_label,b.m_label);
}
template<class Range>
void reorderElements(Range const& indices){
m_data.reorderElements(indices);
m_label.reorderElements(indices);
}
///\brief shuffles all elements in the entire dataset (that is, also across the batches)
void shuffle(){
std::vector<std::size_t> indices(numberOfElements());
std::iota(indices.begin(),indices.end(),0);
std::shuffle(indices.begin(),indices.end(), random::globalRng);
reorderElements(indices);
}
// SUBSETS
///\brief Fill in the subset defined by the list of indices.
LabeledData indexedSubset(IndexSet const& indices) const{
return LabeledData(m_data.indexedSubset(indices),m_label.indexedSubset(indices));
}
protected:
InputContainer m_data; ///< point data
LabelContainer m_label; ///< label data
};
/// specialized template for classification with unsigned int labels
typedef LabeledData<RealVector, unsigned int> ClassificationDataset;
/// specialized template for regression with RealVector labels
typedef LabeledData<RealVector, RealVector> RegressionDataset;
/// specialized template for classification with unsigned int labels and sparse data
typedef LabeledData<CompressedRealVector, unsigned int> CompressedClassificationDataset;
template<class Functor, class T>
struct TransformedData{
typedef Data<typename detail::TransformedDataElement<Functor,T>::type > type;
};
namespace detail{
template<class T>
struct InferShape{
static Shape infer(T const&){return {};}
};
template<class T>
struct InferShape<Data<blas::vector<T> > >{
static Shape infer(Data<blas::vector<T> > const& f){
return {f.element(0).size()};
}
};
template<class T>
struct InferShape<Data<blas::compressed_vector<T> > >{
static Shape infer(Data<blas::compressed_vector<T> > const& f){
return {f.element(0).size()};
}
};
}
/**
* \addtogroup shark_globals
* @{
*/
/// \brief creates a data object from a range of elements
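///
/// A minimal usage sketch (assuming a std::vector<RealVector> named points):
/// \code
/// Data<RealVector> data = createDataFromRange(points, 100); // batches of at most 100 elements
/// \endcode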
template<class Range>
Data<typename Range::value_type>
createDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
typedef typename Range::value_type Input;
if (maximumBatchSize == 0)
maximumBatchSize = Data<Input>::DefaultBatchSize;
std::size_t numPoints = inputs.size();
//first determine the optimal number of batches as well as batch size
std::size_t batches = numPoints / maximumBatchSize;
if(numPoints > batches*maximumBatchSize)
++batches;
std::size_t optimalBatchSize=numPoints/batches;
std::size_t remainder = numPoints-batches*optimalBatchSize;
Data<Input> data(batches);
//now create the batches taking the remainder into account
auto start= inputs.begin();
for(std::size_t i = 0; i != batches; ++i){
std::size_t size = (i<remainder) ? optimalBatchSize+1 : optimalBatchSize;
auto end = start+size;
data.batch(i) = createBatch<Input>(
boost::make_iterator_range(start,end)
);
start = end;
}
data.shape() = detail::InferShape<Data<Input> >::infer(data);
return data;
}
/// \brief creates a data object from a range of elements
template<class Range>
UnlabeledData<typename boost::range_value<Range>::type>
createUnlabeledDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
return createDataFromRange(inputs,maximumBatchSize);
}
/// \brief creates a labeled data object from two ranges, representing inputs and labels
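///
/// A minimal usage sketch (assuming a std::vector<RealVector> named inputs and a std::vector<unsigned int> named labels of the same size):
/// \code
/// ClassificationDataset dataset = createLabeledDataFromRange(inputs, labels);
/// \endcode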
template<class Range1, class Range2>
LabeledData<
typename boost::range_value<Range1>::type,
typename boost::range_value<Range2>::type
> createLabeledDataFromRange(Range1 const& inputs, Range2 const& labels, std::size_t maximumBatchSize = 0){
SHARK_RUNTIME_CHECK(inputs.size() == labels.size(),"Number of inputs and number of labels must agree");
typedef typename boost::range_value<Range1>::type Input;
typedef typename boost::range_value<Range2>::type Label;
if (maximumBatchSize == 0)
maximumBatchSize = LabeledData<Input,Label>::DefaultBatchSize;
return LabeledData<Input,Label>(
createDataFromRange(inputs,maximumBatchSize),
createDataFromRange(labels,maximumBatchSize)
);
}
/// \brief Outstream of elements for labeled data.
template<class T, class U>
std::ostream &operator << (std::ostream &stream, const LabeledData<T, U>& d) {
for(auto elem: d.elements())
stream << elem.input << " [" << elem.label <<"]"<< "\n";
return stream;
}
// FUNCTIONS FOR DIMENSIONALITY
///\brief Return the number of classes of a set of class labels with unsigned int label encoding
inline unsigned int numberOfClasses(Data<unsigned int> const& labels){
unsigned int classes = 0;
for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
classes = std::max(classes,*std::max_element(labels.batch(i).begin(),labels.batch(i).end()));
}
return classes+1;
}
///\brief Returns the number of members of each class in the dataset.
inline std::vector<std::size_t> classSizes(Data<unsigned int> const& labels){
std::vector<std::size_t> classCounts(numberOfClasses(labels),0u);
for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
for(unsigned int elem: labels.batch(i)){
classCounts[elem]++;
}
}
return classCounts;
}
///\brief Return the dimensionality of a dataset.
template <class InputType>
std::size_t dataDimension(Data<InputType> const& dataset){
SHARK_ASSERT(dataset.numberOfElements() > 0);
return dataset.element(0).size();
}
///\brief Return the input dimensionality of a labeled dataset.
template <class InputType, class LabelType>
std::size_t inputDimension(LabeledData<InputType, LabelType> const& dataset){
return dataDimension(dataset.inputs());
}
///\brief Return the label/output dimensionality of a labeled dataset.
template <class InputType, class LabelType>
std::size_t labelDimension(LabeledData<InputType, LabelType> const& dataset){
return dataDimension(dataset.labels());
}
///\brief Return the number of classes (highest label value +1) of a classification dataset with unsigned int label encoding
template <class InputType, class LabelType>
std::size_t numberOfClasses(LabeledData<InputType, LabelType> const& dataset){
return numberOfClasses(dataset.labels());
}
///\brief Returns the number of members of each class in the dataset.
template <class InputType, class LabelType>
inline std::vector<std::size_t> classSizes(LabeledData<InputType, LabelType> const& dataset){
return classSizes(dataset.labels());
}
// TRANSFORMATION
///\brief Transforms a dataset using a Functor f and returns the transformed result.
///
/// This version is used when the Functor supports only element-by-element transformations.
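///
/// A minimal usage sketch (assuming a Data<unsigned int> object named labels):
/// \code
/// Data<unsigned int> shifted = transform(labels, [](unsigned int y){ return (unsigned int)(y + 1); });
/// \endcode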
template<class T, class Functor>
typename boost::lazy_disable_if<
CanBeCalled<Functor, typename Data<T>::batch_type>,
TransformedData<Functor, T>
>::type
transform(Data<T> const& data, Functor f){
typedef typename detail::TransformedDataElement<Functor, T>::type ResultType;
int batches = (int) data.numberOfBatches();
Data<ResultType> result(batches);
SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
result.batch(i)= createBatch(
boost::make_transform_iterator(batchBegin(data.batch(i)), f),
boost::make_transform_iterator(batchEnd(data.batch(i)), f)
);
result.shape() = detail::InferShape<Data<ResultType> >::infer(result);
return result;
}
///\brief Transforms a dataset using a Functor f and returns the transformed result.
///
/// This version is used when the Functor supports batch-by-batch transformations.
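///
/// A minimal usage sketch; it assumes a model or preprocessor whose operator() maps whole batches,
/// for example a trained Normalizer<RealVector> named normalizer, and a Data<RealVector> object named data:
/// \code
/// Data<RealVector> normalized = transform(data, normalizer);
/// \endcode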
template<class T, class Functor>
typename boost::lazy_enable_if<
CanBeCalled<Functor, typename Data<T>::batch_type>,
TransformedData<Functor, T>
>::type
transform(Data<T> const& data, Functor const& f){
typedef typename detail::TransformedDataElement<Functor, T>::type ResultType;
int batches = (int) data.numberOfBatches();
Data<ResultType> result(batches);
SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
result.batch(i)= f(data.batch(i));
Shape shape = detail::InferShape<Functor>::infer(f);
if(shape == Shape()){
shape = detail::InferShape<Data<ResultType> >::infer(result);
}
result.shape() = shape;
return result;
}
///\brief Transforms the inputs of a dataset and return the transformed result.
template<class I, class L, class Functor>
LabeledData<typename detail::TransformedDataElement<Functor, I>::type, L >
transformInputs(LabeledData<I, L> const& data, Functor const& f){
typedef LabeledData<typename detail::TransformedDataElement<Functor, I>::type, L > DatasetType;
return DatasetType(transform(data.inputs(),f),data.labels());
}
///\brief Transforms the labels of a dataset and returns the transformed result.
template<class I, class L, class Functor>
LabeledData<I, typename detail::TransformedDataElement<Functor, L>::type >
transformLabels(LabeledData<I, L> const& data, Functor const& f){
typedef LabeledData<I, typename detail::TransformedDataElement<Functor, L>::type > DatasetType;
return DatasetType(data.inputs(),transform(data.labels(),f));
}
///\brief Creates a copy of a dataset selecting only a certain set of features.
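///
/// A minimal usage sketch (assuming a Data<RealVector> object named data with at least three input dimensions):
/// \code
/// std::vector<std::size_t> features = {0, 2};                // keep the first and the third feature
/// Data<RealVector> reduced = selectFeatures(data, features); // dataDimension(reduced) == 2
/// \endcode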
template<class T, class FeatureSet>
Data<blas::vector<T> > selectFeatures(Data<blas::vector<T> > const& data, FeatureSet const& features){
auto select = [&](blas::matrix<T> const& input){
blas::matrix<T> output(input.size1(),features.size());
for(std::size_t i = 0; i != input.size1(); ++i){
for(std::size_t j = 0; j != features.size(); ++j){
output(i,j) = input(i,features[j]);
}
}
return output;
};
return transform(data,select);
}
template<class T, class FeatureSet>
LabeledData<blas::vector<T>, unsigned int> selectInputFeatures(LabeledData<blas::vector<T>, unsigned int> const& data, FeatureSet const& features){
return LabeledData<blas::vector<T>, unsigned int>(selectFeatures(data.inputs(),features), data.labels());
}
/// \brief Removes the last part of a given dataset and returns a new split containing the removed elements
///
/// For this operation, the dataset is not allowed to be shared.
/// \param data the dataset which should be split
/// \param elementIndex the index of the first element of the second part
/// \returns the set which contains the split-off elements (right part of the given set)
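///
/// A minimal usage sketch (assuming a ClassificationDataset named data with at least 700 elements):
/// \code
/// ClassificationDataset test = splitAtElement(data, 700); // data keeps elements 0..699, test holds the rest
/// \endcode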
template<class DatasetT>
DatasetT splitAtElement(DatasetT& data, std::size_t elementIndex){
SIZE_CHECK(elementIndex<=data.numberOfElements());
std::size_t batchPos = 0;
std::size_t batchStart = 0;
while(batchStart + batchSize(data.batch(batchPos)) < elementIndex){
batchStart += batchSize(data.batch(batchPos));
++batchPos;
};
std::size_t splitPoint = elementIndex-batchStart;
if(splitPoint != 0){
data.splitBatch(batchPos,splitPoint);
++batchPos;
}
return data.splice(batchPos);
}
///\brief Reorders the dataset such that points are grouped by labels
///
/// The elements are not only reordered but the batches are also resized such that every batch
/// only contains elements of one class. This method must be used in order to use binarySubProblem.
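///
/// A minimal usage sketch (assuming a ClassificationDataset named data with classes 0, 1 and 2):
/// \code
/// repartitionByClass(data);
/// ClassificationDataset problem02 = binarySubProblem(data, 0, 2); // class 0 becomes label 0, class 2 becomes label 1
/// \endcode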
template<class I>
void repartitionByClass(LabeledData<I, unsigned int>& data, std::size_t batchSize = LabeledData<I, unsigned int>::DefaultBatchSize){
std::vector<std::size_t> classCounts = classSizes(data);
std::vector<std::size_t> partitioning;//new, optimal partitioning of the data according to the batch sizes
std::vector<std::size_t> classStart;//at which batch the elements of the class are starting
detail::batchPartitioning(classCounts, classStart, partitioning, batchSize);
data.repartition(partitioning);
std::vector<std::size_t> classIndex(classCounts.size(),0);
for(std::size_t i = 1; i != classIndex.size();++i){
classIndex[i] = classIndex[i-1] + classCounts[i-1];
}
std::vector<std::size_t> elemIndex(data.numberOfElements(), 0);
std::size_t index = 0;
for (auto const& elem: data.elements()){
std::size_t c = elem.label;
elemIndex[classIndex[c] ] = index;
++index;
++classIndex[c];
}
data.reorderElements(elemIndex);
}
template<class I>
LabeledData<I, unsigned int> binarySubProblem(
LabeledData<I, unsigned int> const& data,
unsigned int zeroClass,
unsigned int oneClass
){
std::vector<std::size_t> indexSet;
std::size_t smaller = std::min(zeroClass,oneClass);
std::size_t bigger = std::max(zeroClass,oneClass);
std::size_t numBatches = data.numberOfBatches();
//find first class
std::size_t start= 0;
for(;start != numBatches && getBatchElement(data.batch(start),0).label != smaller;++start);
SHARK_RUNTIME_CHECK(start != numBatches, "First class does not exist");
//copy batch indices of first class
for(;start != numBatches && getBatchElement(data.batch(start),0).label == smaller; ++start)
indexSet.push_back(start);
//find second class
for(;start != numBatches && getBatchElement(data.batch(start),0).label != bigger;++start);
SHARK_RUNTIME_CHECK(start != numBatches, "Second class does not exist");
//copy batch indices of second class
for(;start != numBatches && getBatchElement(data.batch(start),0).label == bigger; ++start)
indexSet.push_back(start);
return transformLabels(data.indexedSubset(indexSet), [=](unsigned int label){return (unsigned int)(label == oneClass);});
}
/// \brief Construct a binary (two-class) one-versus-rest problem from a multi-class problem.
///
/// \par
/// The function returns a new LabeledData object. The input part
/// coincides with the multi-class data, but the label part is replaced
/// with binary labels 0 and 1. All instances of the given class
/// (parameter oneClass) get a label of one, all others are assigned a
/// label of zero.
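///
/// A minimal usage sketch (assuming a ClassificationDataset named data):
/// \code
/// ClassificationDataset binary = oneVersusRestProblem(data, 2); // class 2 maps to label 1, all other classes to 0
/// \endcode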
template<class I>
LabeledData<I, unsigned int> oneVersusRestProblem(
LabeledData<I, unsigned int> const& data,
unsigned int oneClass
){
return transformLabels(data, [=](unsigned int label){return (unsigned int)(label == oneClass);});
}
///\brief Returns the given column (feature index) of a vectorial dataset as a single vector over all elements.
template<class RowType>
RowType getColumn(Data<RowType> const& data, std::size_t columnID) {
SHARK_ASSERT(dataDimension(data) > columnID);
RowType column(data.numberOfElements());
std::size_t rowCounter = 0;
for(auto element: data.elements()){
column(rowCounter) = element(columnID);
rowCounter++;
}
return column;
}
///\brief Sets the given column (feature index) of a vectorial dataset to the values of newColumn.
template<class RowType>
void setColumn(Data<RowType>& data, std::size_t columnID, RowType newColumn) {
SHARK_ASSERT(dataDimension(data) > columnID);
SHARK_ASSERT(data.numberOfElements() == newColumn.size());
std::size_t rowCounter = 0;
for(auto element: data.elements()){
element(columnID) = newColumn(rowCounter);
rowCounter++;
}
}
/** @} */
}
#endif