//=========================================================================== /*! * * * \brief Learning problems given by analytic distributions. * * * * * \author T. Glasmachers * \date 2006-2013 * * * \par Copyright 1995-2017 Shark Development Team * *

* This file is part of Shark. * * * Shark is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Shark is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with Shark. If not, see . * */ //=========================================================================== #ifndef SHARK_DATA_DATADISTRIBUTION_H #define SHARK_DATA_DATADISTRIBUTION_H #include #include #include #include namespace shark { /// /// \brief A DataDistribution defines an unsupervised learning problem. /// /// \par /// The unsupervised learning problem is defined by an explicit /// distribution (in contrast to a finite dataset). The only /// method we need is to draw a sample from the distribution. /// template class DataDistribution { public: /// \brief Virtual destructor. virtual ~DataDistribution() { } /// \brief Generates a single pair of input and label. /// /// @param input the generated input virtual void draw(InputType& input) const = 0; // \brief Interface for std::generate. InputType operator() () { InputType ret; draw(ret); return ret; } /// \brief Generates a data set with samples from from the distribution. /// /// @param size the number of samples in the dataset /// @param maximumBatchSize the maximum size of a batch UnlabeledData generateDataset(std::size_t size,std::size_t maximumBatchSize) const { std::vector data(size); // draw the samples for (std::size_t i = 0; i < size; ++i){ draw(data[i]); } //create dataset return createUnlabeledDataFromRange(data,maximumBatchSize); } /// \brief Generates a data set with samples from from the distribution. /// /// @param size the number of samples in the dataset UnlabeledData generateDataset(std::size_t size) const { return generateDataset(size,Data::DefaultBatchSize ); } }; /// /// \brief A LabeledDataDistribution defines a supervised learning problem. /// /// \par /// The supervised learning problem is defined by an explicit /// distribution (in contrast to a finite dataset). The only /// method we need is to draw a sample from the distribution. /// template class LabeledDataDistribution { public: /// \brief Virtual destructor. virtual ~LabeledDataDistribution() { } /// \brief Generates a single pair of input and label. /// @param input the generated input /// @param label the generated label virtual void draw(InputType& input, LabelType& label) const = 0; // \Brief Interface for std::generate. std::pair operator() () { std::pair ret; draw(ret.first,ret.second); return ret; } /// \brief Generates a dataset with samples from from the distribution. /// /// @param size the number of samples in the dataset /// @param maximumBatchSize the maximum size of a batch LabeledData generateDataset(std::size_t size,std::size_t maximumBatchSize) const{ std::vector inputs(size); std::vector labels(size); // draw the samples for (std::size_t i = 0; i < size; ++i){ draw(inputs[i], labels[i]); } //create dataset return createLabeledDataFromRange(inputs,labels,maximumBatchSize); } /// \brief Generates a data set with samples from from the distribution. /// /// @param size the number of samples in the dataset LabeledData generateDataset(std::size_t size) const { return generateDataset(size,LabeledData::DefaultBatchSize ); } }; /// /// \brief "chess board" problem for binary classification /// class Chessboard : public LabeledDataDistribution { public: Chessboard(unsigned int size = 4, double noiselevel = 0.0) { m_size = size; m_noiselevel = noiselevel; } void draw(RealVector& input, unsigned int& label)const{ input.resize(2); unsigned int j, t = 0; for (j = 0; j < 2; j++) { double v = random::uni(random::globalRng, 0.0, (double)m_size); t += (int)floor(v); input(j) = v; } label = (t & 1); if (random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel) label = 1 - label; } protected: unsigned int m_size; double m_noiselevel; }; /// /// \brief Noisy sinc function: y = sin(x) / x + noise /// class Wave : public LabeledDataDistribution { public: Wave(double stddev = 0.1, double range = 5.0){ m_stddev = stddev; m_range = range; } void draw(RealVector& input, RealVector& label)const{ input.resize(1); label.resize(1); input(0) = random::uni(random::globalRng, -m_range, m_range); if(input(0) != 0) label(0) = sin(input(0)) / input(0) + random::gauss(random::globalRng, 0.0, m_stddev); else label(0) = random::gauss(random::globalRng, 0.0, m_stddev); } protected: double m_stddev; double m_range; }; /// "Pami Toy" problem for binary classification, as used in the article "Glasmachers /// and C. Igel. Maximum Likelihood Model Selection for 1-Norm Soft Margin SVMs with Multiple /// Parameters. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2010." /// In summary, the first M dimensions are correlated to the labels, the last N dimensions /// are not. class PamiToy : public LabeledDataDistribution { public: PamiToy(unsigned int size_useful = 5, unsigned int size_noise = 5, double noise_position = 0.0, double noise_variance = 1.0 ) : m_size( size_useful+size_noise ), m_sizeUseful( size_useful ), m_sizeNoise( size_noise ), m_noisePos( noise_position) , m_noiseVar( noise_variance ) { } void draw(RealVector& input, unsigned int& label)const{ input.resize( m_size ); label = (unsigned int) random::discrete(random::globalRng, 0,1); //fix label first double y2 = label - 0.5; //"clean" informative feature values // now fill the informative features.. for ( unsigned int i=0; i { public: CircleInSquare( unsigned int dimensions = 2, double noiselevel = 0.0, bool class_prob_equal = false ) : m_dimensions( dimensions ), m_noiselevel( noiselevel ), m_lowerLimit( -1 ), m_upperLimit( 1 ), m_centerpoint( 0 ), m_inner_radius2( 0.5*0.5 ), m_outer_radius2( 0.5*0.5 ), m_equal_class_prob( class_prob_equal ) { } /// allow for arbitrary box limits void setLimits( double lower_limit, double upper_limit, double inner_radius, double outer_radius ) { RANGE_CHECK( lower_limit < upper_limit ); RANGE_CHECK( inner_radius <= outer_radius ); RANGE_CHECK( 2*outer_radius <= upper_limit-lower_limit ); m_lowerLimit = lower_limit; m_upperLimit = upper_limit; m_centerpoint = (upper_limit-lower_limit)/2.0; m_inner_radius2 = inner_radius*inner_radius; m_outer_radius2 = outer_radius*outer_radius; } void draw(RealVector& input, unsigned int& label)const { input.resize( m_dimensions ); double v, dist; if ( m_equal_class_prob ) { //each class has equal probability - this implementation is brute-force and gorgeously inefficient :/ bool this_label = random::coinToss(random::globalRng); label = ( this_label ? 1 : 0 ); if ( random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel ) label = 1 - label; if ( this_label ) { do { dist = 0.0; for ( unsigned int i=0; i m_inner_radius2 ); } else { do { dist = 0.0; for ( unsigned int i=0; i m_inner_radius2 && dist < m_outer_radius2 ); } } protected: unsigned int m_dimensions; double m_noiselevel; double m_lowerLimit; double m_upperLimit; double m_centerpoint; double m_inner_radius2; double m_outer_radius2; bool m_equal_class_prob; /// { public: DiagonalWithCircle( double radius = 1.0, double noise = 0.0 ) : m_radius2( radius*radius ), m_noiselevel( noise ) { } void draw(RealVector& input, unsigned int& label)const { input.resize( 2 ); double x,y; x = random::uni(random::globalRng, 0, 4 ); //zero is left y = random::uni(random::globalRng, 0, 4 ); //zero is bottom // assign label according to position w.r.t. the diagonal if ( x+y < 4 ) label = 1; else label = 0; // but if in the circle (even above diagonal), assign positive label if ( (3-x)*(3-x) + (1-y)*(1-y) < m_radius2 ) label = 1; // add noise if ( random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel ) label = 1 - label; input(0) = x; input(1) = y; } protected: double m_radius2; double m_noiselevel; }; /// \brief Generates a set of normally distributed points class NormalDistributedPoints:public DataDistribution { public: /// \brief Generates a simple distribution with NormalDistributedPoints(std::size_t dim): m_offset(dim,0){ RealMatrix covariance(dim,dim,0); diag(covariance) = blas::repeat(1.0,dim); m_dist.setCovarianceMatrix(covariance); } NormalDistributedPoints(RealMatrix const& covariance, RealVector const& offset) :m_dist(covariance), m_offset(offset){ SIZE_CHECK(offset.size() == covariance.size1()); } void draw(RealVector& input) const{ input.resize(m_offset.size()); noalias(input) = m_offset; noalias(input) += m_dist(random::globalRng).first; } private: MultiVariateNormalDistributionCholesky m_dist; RealVector m_offset; }; /// \brief Given a set of images, draws a set of image patches of a given size class ImagePatches:public DataDistribution{ public: ImagePatches( Data images, std::size_t imageWidth, std::size_t imageHeight, std::size_t patchWidth, std::size_t patchHeight ):m_images(images) , m_imageWidth(imageWidth) , m_imageHeight(imageHeight) , m_patchWidth(patchWidth) , m_patchHeight(patchHeight) ,m_numImages(m_images.numberOfElements()){} void draw(RealVector& input) const{ //sample image std::size_t imageNum = random::discrete(random::globalRng, std::size_t(0),m_numImages-1); Data::const_element_reference image = m_images.element(imageNum); //draw the upper left corner of the image std::size_t m_startX = random::discrete(random::globalRng, std::size_t(0),m_imageWidth-m_patchWidth); std::size_t m_startY = random::discrete(random::globalRng, std::size_t(0),m_imageHeight-m_patchHeight); //copy patch input.resize(m_patchWidth * m_patchHeight); std::size_t rowStart = m_startY * m_imageWidth + m_startX; for (size_t y = 0; y < m_patchHeight; ++y){ for (size_t x = 0; x < m_patchWidth; ++x){ input(y * m_patchWidth + x) = image(rowStart+x); } rowStart += m_imageWidth; } } private: Data m_images; std::size_t m_imageWidth; std::size_t m_imageHeight; std::size_t m_patchWidth; std::size_t m_patchHeight; std::size_t m_numImages; }; } #endif