/*!
*
* \brief Internal functionality and implementation of the Data class
*
* \author O. Krause
* \date 2012
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Shark. If not, see .
*
*/
#ifndef SHARK_DATA_IMPL_DATASET_INL
#define SHARK_DATA_IMPL_DATASET_INL
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
namespace shark {
namespace detail{
/**
* \ingroup shark_detail
*
* @{
*/
///\brief Computes a partitioning of a st of elements in batches.
///
/// Given a number of elements and the maximum size of a batch,
/// computes the optimal number of batches and returns the size of every batch such that
/// all batches have as equal a size as possible.
///
/// \param numElements number of elements to partition
/// \param maximumBatchSize the maximum size of a batch
/// \return a vector with th size of every batch
inline std::vector optimalBatchSizes(std::size_t numElements, std::size_t maximumBatchSize){
std::vector batchSizes;
std::size_t batches = numElements / maximumBatchSize;
if(numElements-batches*maximumBatchSize > 0)
++batches;
std::size_t optimalBatchSize=numElements/batches;
std::size_t remainder = numElements-batches*optimalBatchSize;
for(std::size_t j = 0; j != batches; ++j){
std::size_t size = (j const& partitionSizes,
std::vector& partitionStart,
std::vector& batchSizes,
std::size_t maximumBatchSize
){
std::size_t sumOfBatches = 0;
std::size_t numberOfPartitions=partitionSizes.size();
for (std::size_t i = 0; i != numberOfPartitions; i++){
partitionStart.push_back(sumOfBatches);
std::vector batchSizesOfPartition = optimalBatchSizes(partitionSizes[i],maximumBatchSize);
batchSizes.insert(batchSizes.end(),batchSizesOfPartition.begin(),batchSizesOfPartition.end());
sumOfBatches+=batchSizesOfPartition.size();
}
return sumOfBatches;
}
/// compute the complement of the indices with respect to the set [0,...n[
template
void complement(
T const& set,
std::size_t n,
T2& comp)
{
std::vector parentSet(n);
for(std::size_t i = 0; i != n; ++i){
parentSet[i]=i;
}
std::vector setCopy(set.begin(),set.end());
std::sort(setCopy.begin(),setCopy.end());
std::vector resultSet(parentSet.size());
std::vector::iterator pos = std::set_difference(
parentSet.begin(),parentSet.end(),
setCopy.begin(),setCopy.end(),
resultSet.begin()
);
comp.resize(std::distance(resultSet.begin(),pos));
std::copy(resultSet.begin(),pos,comp.begin());
}
/// compute the index set for the range [0, ..., size[
template
void range(size_t size, T& indices){
indices.resize(size);
for (size_t i=0; i
void range(size_t size,size_t start, T& indices){
indices.resize(size);
for (size_t i=0; i
class SharedContainer : public ISerializable
{
public:
typedef typename Batch::type BatchType;
typedef typename Batch::reference reference;
typedef typename Batch::const_reference const_reference;
private:
typedef Batch BatchTraits;
typedef std::vector > Container;
public:
///\brief Create an empty container.
SharedContainer(){}
///\brief creates a shared container with a number of empty batches.
SharedContainer(std::size_t numBatches){
m_data.resize(numBatches);
for(std::size_t i = 0; i != numBatches; ++i){
m_data[i] = boost::make_shared();
}
}
///\brief Create an empty container of specified size with copies of an element
///
///@param size the new size of the container
///@param element the blueprint element from which to create the Container
///@param batchSize the size of the batches. if this is 0, the size is unlimited
SharedContainer(std::size_t size, Type const& element, std::size_t batchSize){
initializeBatches(size,element,batchSize);
}
///\brief Create container from new data.
///
///Creates the SharedContainer and splits the incoming data into several batches
///
///@param data the data from which to create the Container
///@param maximumBatchSize The size of the batches. If set to 0, the size is unlimited
template
SharedContainer(Range const& data, std::size_t maximumBatchSize){
SIZE_CHECK(data.size() != 0 );
std::size_t points = data.size();
if(maximumBatchSize == 0)
maximumBatchSize = points;
//first determin the optimal number of batches as well as batch size
std::size_t batches = points / maximumBatchSize;
if(points > batches*maximumBatchSize)
++batches;
std::size_t optimalBatchSize=points/batches;
std::size_t remainder = points-batches*optimalBatchSize;
//now create the batches taking the remainder into account
m_data.reserve(batches);
std::size_t batchStart = 0;
for(std::size_t i = 0; i != batches; ++i){
std::size_t size = (i::createBatch(
boost::make_iterator_range(boost::begin(data)+batchStart,boost::begin(data)+batchEnd)
));
batchStart+=size;
}
}
///\brief Create a shared container as subset of another container
///
///@param container the container from which the subset is to be generated
///@param indizes indizes indicating which batches should be shared
SharedContainer(SharedContainer const& container, std::vector const& indizes){
m_data.resize(indizes.size());
for(std::size_t i = 0; i != indizes.size(); ++i){
SIZE_CHECK(indizes[i] < container.size());
m_data[i] = container.m_data[indizes[i]];
}
}
/// \brief Clear the contents of this container without affecting the others.
void clear(){
m_data.clear();
}
///\brief check whether the set is empty
bool empty() const{
return size() == 0;
}
///\brief Return the number of batches.
std::size_t size() const{
return m_data.size();
}
///\brief Computes the total number of elements
std::size_t numberOfElements()const{
std::size_t numElems = 0;
for(std::size_t i = 0; i != m_data.size(); ++i){
numElems+=BatchTraits::size(*m_data[i]);
}
return numElems;
}
boost::shared_ptr const& pointer(std::size_t i)const{
return m_data[i];
}
BatchType const& batch(std::size_t i)const{
SIZE_CHECK(i < size());
return *m_data[i];
}
BatchType& batch(std::size_t i){
SIZE_CHECK(i < size());
return *m_data[i];
}
template bool operator == (const SharedContainer& rhs) {
return (m_data == rhs.m_data);
}
////////////////////////////ITERATOR INTERFACE//////////////////////////////////////
///////////ITERATORS OVER THE BATCHES//////////////////
typedef boost::indirect_iterator< typename Container::const_iterator,const BatchType, boost::use_default, BatchType const& > const_iterator;
typedef boost::indirect_iterator< typename Container::iterator > iterator;
///\brief Iterator access over the batches.
const_iterator begin() const{
return const_iterator(m_data.begin());
}
///\brief Iterator access over the batches.
const_iterator end() const{
return const_iterator(m_data.end());
}
///\brief Iterator access over the batches.
iterator begin(){
return iterator(m_data.begin());
}
///\brief Iterator access over the batches.
iterator end(){
return iterator(m_data.end());
}
///////////////////////ADDING NEW BATCHES////////////////////////
void push_back(BatchType const& batch){
m_data.push_back(boost::make_shared(batch));
}
void append(SharedContainer const& other){
m_data.insert(m_data.end(),other.m_data.begin(),other.m_data.end());
}
////////////////////////////SPLITTING//////////////////////////////////////
///\brief splits the batch indicated by the iterator at elementIndex in two parts.
///
///Order of elements remain unchanged. SharedContainer is not allowed to be shared for
///this to work.
void splitBatch(iterator position, std::size_t elementIndex){
SHARK_RUNTIME_CHECK(isIndependent(), "Container is not Independent");
SIZE_CHECK(elementIndex <= batchSize(*position));
BatchType& source=*position;
std::size_t leftElements = elementIndex;
std::size_t rightElements = batchSize(source)-leftElements;
if(leftElements == 0 || rightElements == 0)
return;
auto leftSplit = boost::make_shared(BatchTraits::createBatch(getBatchElement(source,0),leftElements));
auto rightSplit = boost::make_shared(BatchTraits::createBatch(getBatchElement(source,0),rightElements));
std::copy(BatchTraits::begin(source),BatchTraits::begin(source)+leftElements,BatchTraits::begin(*leftSplit));
std::copy(BatchTraits::begin(source)+leftElements,BatchTraits::end(source),BatchTraits::begin(*rightSplit));
*(position.base())=rightSplit;//override old batch
m_data.insert(position.base(),leftSplit);
}
///\brief Splits the container in two independent parts. The left part remains in the container, the right is stored as return type
///
///Order of elements remain unchanged. The SharedContainer is not allowed to be shared for
///this to work.
SharedContainer splice(iterator position){
SHARK_RUNTIME_CHECK(isIndependent(), "Container is not Independent");
SharedContainer right;
right.m_data.assign(position.base(),m_data.end());
m_data.erase(position.base(),m_data.end());
return right;
}
///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
///
///After the operation the container will contain batchSizes.size() batchs with the i-th batch having size batchSize[i].
///However the sum of all batch sizes must be equal to the current number of elements
template
void repartition(Range const& batchSizes){
std::size_t sum = 0;
for(std::size_t i: batchSizes)
sum += i;
SIZE_CHECK(sum == numberOfElements());
SHARK_RUNTIME_CHECK(isIndependent(), "Container is not Independent");
Container newPartitioning;
std::size_t currentBatch = 0;
std::size_t currentBatchIndex = 0;
for(std::size_t i = 0; i != batchSizes.size(); ++i){
//create new batch
std::size_t currentBatchSize = batchSizes[i];
boost::shared_ptr newBatch = boost::make_shared(BatchTraits::createBatch(getBatchElement(batch(currentBatch),0),currentBatchSize));
for(std::size_t j = 0; j != currentBatchSize; ++j){
getBatchElement(*newBatch,j)=getBatchElement(batch(currentBatch),currentBatchIndex);
++currentBatchIndex;
if(currentBatchIndex == BatchTraits::size(batch(currentBatch))){
m_data[currentBatch].reset();//free old memory
++currentBatch;
currentBatchIndex = 0;
}
}
newPartitioning.push_back(newBatch);
}
//sanity check
SIZE_CHECK(currentBatch == size());
//swap old(mpty) with new partitioning
swap(m_data,newPartitioning);
}
/// \brief Creates a vector with the batch sizes of every batch.
///
/// This method can be used together with repartition to ensure
/// that two SharedContainers have the same batch structure.
std::vector getPartitioning()const{
std::vector batchSizes(size());
for(std::size_t i = 0; i != size(); ++i){
batchSizes[i] = BatchTraits::size(*m_data[i]);
}
return batchSizes;
}
/////////////////////MISC/////////////////////////////////
///\brief Is the container independent of all others?
///
/// In other words, does it NOT share data?
/// This method checks for every batch if it is shared. So it should not be called too often.
bool isIndependent() const{
for(std::size_t i = 0; i != m_data.size(); ++i){
if(!m_data[i].unique()){
return false;
}
}
return true;
}
///\brief Ensures that the container is independent.
///
/// Makes sure that the data of this instance can be
/// modified without affecting other containers. If
/// necessary, a deep copy of the data is made.
void makeIndependent(){
if (isIndependent()){
return;
}
Container dataCopy(m_data.size());
for(std::size_t i = 0; i != m_data.size(); ++i){
dataCopy[i].reset(new BatchType(*(m_data[i])));
}
using std::swap;
swap(m_data,dataCopy);
}
/// from ISerializable
void read(InArchive& archive){
archive & m_data;
}
/// from ISerializable
void write(OutArchive& archive) const{
archive & m_data;
}
friend void swap(SharedContainer& a, SharedContainer& b){
a.m_data.swap(b.m_data);
}
private:
/// \brief Shared storage for the element batches.
Container m_data;
void initializeBatches(std::size_t numElements, Type const& element,std::size_t batchSize){
m_data.clear();
if(batchSize == 0|| batchSize > numElements){
push_back(BatchTraits::createBatch(element,numElements));
}
else
{
std::size_t batches = numElements/batchSize+(numElements%batchSize > 0);
m_data.reserve(batches);
std::size_t finalBatchSize = numElements;
for(std::size_t batch = 0; batch != batches-1; ++batch){
push_back(BatchTraits::createBatch(element,batchSize));
finalBatchSize-=batchSize;
}
push_back(BatchTraits::createBatch(element,finalBatchSize));
}
}
};
template
struct BatchRange{
typedef IndexingIterator > iterator;
typedef IndexingIterator const > const_iterator;
typedef typename C::batch_type value_type;
typedef typename boost::mpl::if_<
std::is_const,
typename C::const_batch_reference,
typename C::batch_reference
>::type reference;
typedef typename C::const_batch_reference const_reference;
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
BatchRange()=default;
BatchRange(C* container):m_container(container){}
template
BatchRange(BatchRange const& other):m_container(other.m_container){}
std::size_t size()const{
return m_container->numberOfBatches();
}
bool empty()const{return size() == 0;}
iterator begin(){
return iterator(*this,0);
}
const_iterator begin()const{
return const_iterator(*this,0);
}
iterator end(){
return iterator(*this,size());
}
const_iterator end()const{
return const_iterator(*this,size());
}
reference operator[](std::size_t i){
return m_container->batch(i);
}
const_reference operator[](std::size_t i)const{
return m_container->batch(i);
}
reference front(){
return m_container->batch(0);
}
const_reference front()const{
return m_container->batch(0);
}
private:
template friend struct BatchRange;
C* m_container;
};
template
class DataElementIterator:
public SHARK_ITERATOR_FACADE<
DataElementIterator,
typename Dataset::element_type,
std::random_access_iterator_tag,
typename boost::mpl::if_<
std::is_const,
typename Dataset::const_element_reference,
typename Dataset::element_reference
>::type
>{
private:
Dataset* m_container;
std::size_t m_batchPosition;
std::size_t m_elementPosition;
std::size_t m_positionInSequence;
public:
typedef typename boost::mpl::if_<
std::is_const,
typename Dataset::const_element_reference,
typename Dataset::element_reference
>::type reference;
DataElementIterator()
:m_positionInSequence(0){}
DataElementIterator(
Dataset* container,
std::size_t batchPosition,
std::size_t elementPosition,
std::size_t positionInSequence
):m_container(container),
m_batchPosition(batchPosition),
m_elementPosition(elementPosition),
m_positionInSequence(positionInSequence){}
template
DataElementIterator(DataElementIterator const& other)
:m_container(other.m_container),
m_batchPosition(other.m_batchPosition),
m_elementPosition(other.m_elementPosition),
m_positionInSequence(other.m_positionInSequence){}
template
DataElementIterator operator=(DataElementIterator const& other){
m_container = other.m_container;
m_batchPosition = other.m_batchPosition;
m_elementPosition = other.m_elementPosition;
m_positionInSequence = other.m_positionInSequence;
}
std::size_t index()const{
return m_positionInSequence;
}
auto getInnerIterator()const ->decltype (batchBegin(m_container->batch(m_batchPosition))){
return batchBegin(m_container->batch(m_batchPosition)) + m_elementPosition;
}
private:
friend class SHARK_ITERATOR_CORE_ACCESS;
template friend class DataElementIterator;
void increment() {
++m_positionInSequence;
++m_elementPosition;
if(m_elementPosition == batchSize(m_container->batch(m_batchPosition))){
++m_batchPosition;
m_elementPosition = 0;
}
}
void decrement() {
SIZE_CHECK(m_positionInSequence);//don't call this method when the iterator is on the first element
--m_positionInSequence;
if(m_elementPosition == 0){
--m_batchPosition;
m_elementPosition = batchSize(m_container->batch(m_batchPosition));
}
--m_elementPosition;
}
//this is not exactly O(1) as the standard wants. in fact it's O(n) in the number of inner sequences
//so approximately O(1) if the size of a sequence is big...
void advance(std::ptrdiff_t n){
m_positionInSequence += n;
n += m_elementPosition;//jump from the start of the current inner sequence
m_elementPosition = 0;
if( n == 0)
return;
if(n < 0){
std::size_t npos = -n;
--m_batchPosition;
--npos;
//jump over the outer position until we are in the correct range again
while (npos != 0 && npos >= batchSize(m_container->batch(m_batchPosition)) ){
npos -= batchSize(m_container->batch(m_batchPosition));
--m_batchPosition;
}
m_elementPosition = batchSize(m_container->batch(m_batchPosition)) - 1 - npos;
}
else{
std::size_t npos = n;
//jump over the outer position until we are in the correct range again
while (npos != 0 && npos >= batchSize(m_container->batch(m_batchPosition))){
npos -= batchSize(m_container->batch(m_batchPosition));
++m_batchPosition;
SHARK_RUNTIME_CHECK(m_batchPosition != m_container->numberOfBatches() || (npos == 0), "iterator went past the end");
}
m_elementPosition = npos;
}
}
template
std::ptrdiff_t distance_to(const Iter& other) const{
return (std::ptrdiff_t)other.m_positionInSequence - (std::ptrdiff_t)m_positionInSequence;
}
template
bool equal(Iter const& other) const{
return m_positionInSequence == other.m_positionInSequence;
}
reference dereference() const {
auto&& batch = m_container->batch(m_batchPosition);
return getBatchElement(batch,m_elementPosition);
}
};
/// \brief For Data and functor F calculates the result of the resulting elements F(T).
template
struct TransformedDataElement{
private:
template
struct TransformedDataElementTypeFromBatch{
typedef typename batch_to_element<
typename std::result_of::type
>::type type;
};
public:
typedef typename boost::mpl::eval_if<
CanBeCalled,
std::result_of,
TransformedDataElementTypeFromBatch<
typename Batch::type
>
>::type type;
};
/** @*/
}
///\brief Input-Label pair of data
template
struct InputLabelPair{
InputType input;
LabelType label;
InputLabelPair(){}
template
InputLabelPair(
I&& input,
L&& label
):input(input),label(label){}
template
InputLabelPair(
InputLabelPair const& pair
):input(pair.input),label(pair.label){}
InputLabelPair& operator=(
InputLabelPair const& pair
){
input = pair.input;
label = pair.label;
return *this;
}
template
InputLabelPair& operator=(
InputLabelPair const& pair
){
input = pair.input;
label = pair.label;
return *this;
}
friend bool operator<(InputLabelPair const& op1, InputLabelPair const& op2){
return op1.label < op2.label;
}
};
template
void swap(InputLabelPair&& p1, InputLabelPair&& p2){
using std::swap;
swap(p1.input,p2.input);
swap(p1.label,p2.label);
}
///\brief Input label pair of batches
template
struct InputLabelBatch{
private:
typedef typename BatchTraits::type >::type Batch1Traits;
typedef typename BatchTraits::type >::type Batch2Traits;
public:
Batch1Type input;
Batch2Type label;
typedef InputLabelPair<
typename Batch1Traits::value_type,
typename Batch2Traits::value_type
> value_type;
//the decltype below adds correct const semantic if the template arguments are references.
//the behaviour is the same as mimiking the pair {getBatchElement(input,i), getBatchElement(label,i)}
//depending on whether input or label are const or not (which for reference types should not make any difference)
typedef InputLabelPair<
decltype(getBatchElement(std::declval(),0)),
decltype(getBatchElement(std::declval(),0))
> reference;
typedef InputLabelPair<
decltype(getBatchElement(std::declval::type&>(),0)),
decltype(getBatchElement(std::declval::type&>(),0))
> const_reference;
typedef IndexingIterator iterator;
typedef IndexingIterator const_iterator;
template
InputLabelBatch(
I&& input,
L&& label
):input(input),label(label){}
template
InputLabelBatch(
std::size_t size,Pair const& p
):input(Batch1Traits::createBatch(p.input,size)),label(Batch2Traits::createBatch(p.label,size)){}
template
InputLabelBatch& operator=(InputLabelBatch const& batch){
input = batch.input;
label = batch.label;
return *this;
}
std::size_t size()const{
return Batch1Traits::size(input);
}
iterator begin(){
return iterator(*this,0);
}
const_iterator begin()const{
return const_iterator(*this,0);
}
iterator end(){
return iterator(*this,size());
}
const_iterator end()const{
return const_iterator(*this,size());
}
reference operator[](std::size_t i){
return reference(getBatchElement(input,i),getBatchElement(label,i));
}
const_reference operator[](std::size_t i)const{
return const_reference(getBatchElement(input,i),getBatchElement(label,i));
}
};
template
void swap(InputLabelBatch& p1, InputLabelBatch& p2){
using std::swap;
swap(p1.input,p2.input);
swap(p1.label,p2.label);
}
template
struct Batch >
: public detail::SimpleBatch<
InputLabelBatch::type, typename detail::element_to_batch::type>
>{};
template
struct BatchTraits >{
typedef typename detail::batch_to_element::type InputElem;
typedef typename detail::batch_to_element::type LabelElem;
typedef Batch > type;
};
}
#endif