//=========================================================================== /*! * * * \brief Fast lookup for elements in constant datasets * * * * * * \author O. Krause * \date 2012 * * * \par Copyright 1995-2017 Shark Development Team * *

* This file is part of Shark. * * * Shark is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Shark is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with Shark. If not, see . * */ //=========================================================================== #ifndef SHARK_DATA_DATAVIEW_H #define SHARK_DATA_DATAVIEW_H #include #include #include namespace shark { /// \brief Constant time Element-Lookup for Datasets /// /// Datasets are fast for random lookup of batches. Since batch sizes can be arbitrary structured and /// changed by the user, there is no way for the Data and LabeledData classes to provide fast random access /// to single elements. Still, this property is needed quite often, for example for creating subsets, /// randomize data or tree structures. /// A View stores the position of every element in a dataset. So it has constant time access to the elements but /// it also requires linear memory in the number of elements in the set. This is typically small compared /// to the size of the set itself, but construction imposes an considerable overhead. /// /// In contrast to (Un)LabeledData, which is centered around batches, the View is centered around single elements, /// so its iterators iterate over the elements. /// For a better support for bagging an index method is added which returns the position of the element in the /// underlying data container. Also the iterators are indexed and return this index. template //template parameter can be const! class DataView { public: typedef typename std::remove_const::type dataset_type; //(non const) type of the underlying dataset typedef typename dataset_type::element_type value_type; typedef typename dataset_type::const_element_reference const_reference; typedef typename dataset_type::batch_type batch_type; // We want to support immutable as well as mutable datasets. So we query whether the dataset // is mutable and change the reference type to const if the dataset is immutable. typedef typename boost::mpl::if_< std::is_const, typename dataset_type::const_element_reference, typename dataset_type::element_reference >::type reference; private: typedef typename boost::mpl::if_< std::is_const, typename dataset_type::const_batch_range, typename dataset_type::batch_range >::type batch_range; template class IteratorBase: public SHARK_ITERATOR_FACADE< IteratorBase, value_type, std::random_access_iterator_tag, Reference >{ public: IteratorBase(){} IteratorBase(View& view, std::size_t position) : mpe_view(&view),m_position(position) {} template IteratorBase(IteratorBase const& other) : mpe_view(other.mpe_view),m_position(other.position){} /// \brief returns the position of the element referenced by the iterator inside the dataset /// /// This is usefull for bagging, when identical elements between several susbsets are to be identified std::size_t index()const{ return mpe_view->index(m_position); } private: friend class SHARK_ITERATOR_CORE_ACCESS; template friend class IteratorBase; void increment() { ++m_position; } void decrement() { --m_position; } void advance(std::ptrdiff_t n){ m_position+=n; } template std::ptrdiff_t distance_to(IteratorBase const& other) const{ return (std::ptrdiff_t)other.m_position - (std::ptrdiff_t)m_position; } template bool equal(IteratorBase const& other) const{ return m_position == other.m_position; } Reference dereference() const { return (*mpe_view)[m_position]; } View* mpe_view; std::size_t m_position; }; public: typedef IteratorBase > iterator; typedef IteratorBase const > const_iterator; DataView(){} DataView(DatasetType& dataset) :m_dataset(dataset),m_indices(dataset.numberOfElements()) { std::size_t index = 0; for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){ std::size_t batchSize = Batch::size(dataset.batch(i)); for(std::size_t j = 0; j != batchSize; ++j,++index){ m_indices[index].batch = i; m_indices[index].positionInBatch = j; m_indices[index].datasetIndex = index; } } } /// create a subset of the dataset type using only the elemnt indexed by indices template DataView(DataView const& view, IndexRange const& indices) :m_dataset(view.m_dataset),m_indices(indices.size()) { for(std::size_t i = 0; i != m_indices.size(); ++i) m_indices[i] = view.m_indices[indices[i]]; } reference operator[](std::size_t position){ SIZE_CHECK(position < size()); Index const& index = m_indices[position]; return getBatchElement(m_dataset.batch(index.batch),index.positionInBatch); } const_reference operator[](std::size_t position) const{ SIZE_CHECK(position < size()); Index const& index = m_indices[position]; return getBatchElement(m_dataset.batch(index.batch),index.positionInBatch); } reference front(){ SIZE_CHECK(size() != 0); return (*this)[0]; } const_reference front()const{ SIZE_CHECK(size() != 0); return (*this)[0]; } reference back(){ SIZE_CHECK(size() != 0); return (*this)[size()-1]; } const_reference back()const{ SIZE_CHECK(size() != 0); return (*this)[size()-1]; } /// \brief Position of the element in the dataset. /// /// This is useful for bagging, when identical elements among /// several subsets are to be identified. std::size_t index(std::size_t position)const{ return m_indices[position].datasetIndex; } /// \brief Index of the batch holding the element. std::size_t batch(std::size_t position) const { return m_indices[position].batch; } /// \brief Index inside the batch holding the element. std::size_t positionInBatch(std::size_t position) const { return m_indices[position].positionInBatch; } std::size_t size() const{ return m_indices.size(); } iterator begin(){ return iterator(*this, 0); } const_iterator begin() const{ return const_iterator(*this, 0); } iterator end(){ return iterator(*this, size()); } const_iterator end() const{ return const_iterator(*this, size()); } dataset_type const& dataset()const{ return m_dataset; } private: dataset_type m_dataset; // Stores for an element of the dataset, at which position of which batch it is located // as well as the real index of the element inside the dataset struct Index{ std::size_t batch;//the batch in which the element is located std::size_t positionInBatch;//at which position in the batch it is std::size_t datasetIndex;//index inside the dataset }; std::vector m_indices;//stores for every element of the set it's position inside the dataset }; /// \brief creates a subset of a DataView with elements indexed by indices /// /// \param view the view for which the subset is to be created /// \param indizes the index of the elements to be stored in the view template DataView subset(DataView const& view, IndexRange const& indizes){ //O.K. todo: Remove constructor later on, this is a quick fix. return DataView(view,indizes); } /// \brief creates a random subset of a DataView with given size /// /// \param view the view for which the subset is to be created /// \param size the size of the subset template DataView randomSubset(DataView const& view, std::size_t size){ std::vector indices(view.size()); std::iota(indices.begin(),indices.end(),0); partial_shuffle(indices.begin(),indices.begin()+size,indices.end()); return subset(view,boost::make_iterator_range(indices.begin(),indices.begin()+size)); } /// \brief Creates a batch given a set of indices /// /// \param view the view from which the batch is to be created /// \param indizes the set of indizes defining the batch template typename DataView::batch_type subBatch( DataView const& view, IndexRange const& indizes ){ //create a subset of the view containing the elements of the batch DataView batchElems = subset(view,indizes); //and now use the batch range construction to create it return createBatch(batchElems); } /// \brief Creates a random batch of a given size /// /// \param view the view from which the batch is to be created /// \param size the size of the batch template typename DataView::batch_type randomSubBatch( DataView const& view, std::size_t size ){ std::vector indices(view.size()); std::iota(indices.begin(),indices.end(),0); partial_shuffle(indices.begin(),indices.begin()+size,indices.end()); return subBatch(view,boost::make_iterator_range(indices.begin(),indices.begin()+size)); } /// \brief Creates a View from a dataset. /// /// This is just a helper function to omit the actual type of the view /// /// \param set the dataset from which to create the view template DataView toView(DatasetType& set){ return DataView(set); } /// \brief Creates a new dataset from a View. /// /// When the elements of a View needs to be processed repeatedly it is often better to use /// the packed format of the Dataset again, since then the faster batch processing can be used /// /// \param view the view from which to create the new dataset /// \param batchSize the size of the batches in the dataset template typename DataView::dataset_type toDataset(DataView const& view, std::size_t batchSize = DataView::dataset_type::DefaultBatchSize){ if(view.size() == 0) return typename DataView::dataset_type(); //O.K. todo: this is slow for sparse elements, use subBatch or something similar. std::size_t elements = view.size(); typename DataView::dataset_type dataset(elements,view[0],batchSize); std::copy(view.begin(),view.end(),dataset.elements().begin()); return dataset; } /// Return the number of classes (size of the label vector) /// of a classification dataset with RealVector label encoding. template std::size_t numberOfClasses(DataView const& view){ return numberOfClasses(view.dataset()); } /// Return the input dimensionality of the labeled dataset represented by the view template std::size_t inputDimension(DataView const& view){ return inputDimension(view.dataset()); } /// Return the label dimensionality of the labeled dataset represented by the view template std::size_t labelDimension(DataView const& view){ return labelDimension(view.dataset()); } /// Return the dimensionality of the dataset represented by the view template std::size_t dataDimension(DataView const& view){ return dataDimension(view.dataset()); } /** @*/ } #endif