//===========================================================================
/*!
 *
 *
 * \brief       Gaussian automatic relevance detection (ARD) kernel
 *
 *
 * \author      T. Glasmachers, O. Krause, M. Tuma
 * \date        2010-2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * This file is part of Shark.
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_MODELS_KERNELS_GAUSSIAN_ARD_KERNEL_H
#define SHARK_MODELS_KERNELS_GAUSSIAN_ARD_KERNEL_H

#include <shark/Models/Kernels/AbstractKernelFunction.h>

namespace shark {


/// \brief Automatic relevance detection kernel for unconstrained parameter optimization
///
/// The automatic relevance detection (ARD) kernel is a general Gaussian kernel with
/// diagonal covariance matrix:
/// \f$ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f$.
/// The ARD kernel holds one real-valued parameter \f$ \gamma_i \f$ per input dimension.
/// The parameters \f$ p_i \f$ are encoded as \f$ p_i = \log(\gamma_i) \f$, allowing for
/// unconstrained optimization: the exposed/visible parameters are transformed by an exp
/// before being used in the actual computations.
///
/// Note that, like most models/kernels designed for unconstrained optimization, the
/// argument to the constructor corresponds to the true weights, while the set and get
/// methods for the parameter vector operate on the encoded values, not the true weights.
///
/// \todo The default implementation is slow; use BLAS3 calls to make things faster.
template<class InputType = RealVector>
class ARDKernelUnconstrained : public AbstractKernelFunction<InputType>{
private:
	typedef AbstractKernelFunction<InputType> base_type;

	struct InternalState: public State{
		RealMatrix kxy;

		void resize(std::size_t sizeX1, std::size_t sizeX2){
			kxy.resize(sizeX1, sizeX2);
		}
	};
public:
	typedef typename base_type::BatchInputType BatchInputType;
	typedef typename base_type::ConstInputReference ConstInputReference;
	typedef typename base_type::ConstBatchInputReference ConstBatchInputReference;

	/// Constructor
	/// \param dim input dimension
	/// \param gamma_init initial gamma value for all dimensions (true value, used as passed into ctor)
	ARDKernelUnconstrained(unsigned int dim, double gamma_init = 1.0){
		SHARK_RUNTIME_CHECK(gamma_init > 0, "[ARDKernelUnconstrained::ARDKernelUnconstrained] Expected positive weight.");

		//init abstract model's informational flags
		this->m_features |= base_type::HAS_FIRST_PARAMETER_DERIVATIVE;
		this->m_features |= base_type::HAS_FIRST_INPUT_DERIVATIVE;
		this->m_features |= base_type::IS_NORMALIZED;

		//initialize self
		m_inputDimensions = dim;
		m_gammas.resize(m_inputDimensions);
		for(unsigned int i = 0; i < m_inputDimensions; i++){
			m_gammas(i) = gamma_init;
		}
	}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "ARDKernelUnconstrained"; }

	RealVector parameterVector() const{
		return log(m_gammas);
	}

	void setParameterVector(RealVector const& newParameters){
		SIZE_CHECK(newParameters.size() == m_inputDimensions);
		noalias(m_gammas) = exp(newParameters);
	}

	std::size_t numberOfParameters() const{
		return m_inputDimensions;
	}

	///\brief creates the internal state of the kernel
	boost::shared_ptr<State> createState() const{
		return boost::shared_ptr<State>(new InternalState());
	}

	/// convenience methods for setting/getting the actual gamma values
	RealVector gammaVector() const{
		return m_gammas;
	}
	void setGammaVector(RealVector const& newGammas){
#ifndef NDEBUG
		SIZE_CHECK(newGammas.size() == m_inputDimensions);
		for(unsigned int i = 0; i < newGammas.size(); i++){
			SHARK_ASSERT(newGammas(i) > 0);
		}
#endif
		m_gammas = newGammas;
	}
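
	// Usage sketch (illustrative comment only; the concrete numbers are
	// hypothetical): the gamma accessors above operate on the true weights,
	// while parameterVector()/setParameterVector() work on their element-wise
	// logarithms.
	//
	//   ARDKernelUnconstrained<> kernel(2, 0.5);   // two inputs, gamma_i = 0.5
	//   RealVector p = kernel.parameterVector();   // p_i = log(0.5)
	//   p(0) += 1.0;                               // multiplies gamma_0 by e
	//   kernel.setParameterVector(p);
	//   RealVector g = kernel.gammaVector();       // g(0) ~ 1.36, g(1) = 0.5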

	/// \brief evaluates \f$ k(x,z) \f$
	///
	/// ARD kernel evaluation
	/// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
	double eval(ConstInputReference x1, ConstInputReference x2) const{
		SIZE_CHECK(x1.size() == x2.size());
		SIZE_CHECK(x1.size() == m_inputDimensions);
		double dmnorm2 = diagonalMahalanobisDistanceSqr(x1, x2, m_gammas);
		return std::exp(-dmnorm2);
	}

	/// \brief evaluates \f$ k(x,z) \f$ for a whole batch
	///
	/// ARD kernel evaluation
	/// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
	void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result) const{
		SIZE_CHECK(batchX1.size2() == batchX2.size2());
		SIZE_CHECK(batchX1.size2() == m_inputDimensions);

		std::size_t sizeX1 = batchX1.size1();
		std::size_t sizeX2 = batchX2.size1();
		ensure_size(result, sizeX1, sizeX2);

		//todo: implement fast version of diagonalMahalanobisDistanceSqr for matrices
		for(std::size_t i = 0; i != sizeX1; ++i){
			for(std::size_t j = 0; j != sizeX2; ++j){
				double dmnorm2 = diagonalMahalanobisDistanceSqr(row(batchX1, i), row(batchX2, j), m_gammas);
				result(i, j) = std::exp(-dmnorm2);
			}
		}
	}

	/// \brief evaluates \f$ k(x,z) \f$ for a whole batch and caches the values for the derivatives
	///
	/// ARD kernel evaluation
	/// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
	void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result, State& state) const{
		SIZE_CHECK(batchX1.size2() == batchX2.size2());
		SIZE_CHECK(batchX1.size2() == m_inputDimensions);

		std::size_t sizeX1 = batchX1.size1();
		std::size_t sizeX2 = batchX2.size1();
		InternalState& s = state.toState<InternalState>();
		s.resize(sizeX1, sizeX2);
		ensure_size(result, sizeX1, sizeX2);

		//todo: implement fast version of diagonalMahalanobisDistanceSqr for matrices
		for(std::size_t i = 0; i != sizeX1; ++i){
			for(std::size_t j = 0; j != sizeX2; ++j){
				double dmnorm2 = diagonalMahalanobisDistanceSqr(row(batchX1, i), row(batchX2, j), m_gammas);
				result(i, j) = std::exp(-dmnorm2);
				s.kxy(i, j) = result(i, j);
			}
		}
	}
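
	// Derivation sketch for the parameter derivative below: with the encoding
	// p_i = log(gamma_i), i.e. gamma_i = exp(p_i), the chain rule gives
	//
	//   dk(x,z)/dp_i = dk(x,z)/dgamma_i * dgamma_i/dp_i
	//                = -(x_i - z_i)^2 * k(x,z) * gamma_i,
	//
	// which is the expression accumulated below from the kernel values cached
	// in InternalState::kxy during eval().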

	/// \brief evaluates \f$ \frac{\partial k(x,z)}{\partial p_i} \f$ weighted over a whole batch
	///
	/// Since the ARD kernel is parametrized for unconstrained optimization, we return
	/// the derivative w.r.t. the encoded parameters \f$ p_i = \log(\gamma_i) \f$:
	/// \f[ \frac{\partial k(x,z)}{\partial p_i} = -\gamma_i (x_i - z_i)^2 \cdot k(x,z) \f]
	void weightedParameterDerivative(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficients,
		State const& state,
		RealVector& gradient
	) const{
		SIZE_CHECK(batchX1.size2() == batchX2.size2());
		SIZE_CHECK(batchX1.size2() == m_inputDimensions);

		std::size_t sizeX1 = batchX1.size1();
		std::size_t sizeX2 = batchX2.size1();
		ensure_size(gradient, m_inputDimensions);
		gradient.clear();
		InternalState const& s = state.toState<InternalState>();

		for(std::size_t i = 0; i != sizeX1; ++i){
			for(std::size_t j = 0; j != sizeX2; ++j){
				double coeff = coefficients(i, j) * s.kxy(i, j);
				gradient -= coeff * m_gammas * sqr(row(batchX1, i) - row(batchX2, j));
			}
		}
	}

	/// \brief evaluates \f$ \frac{\partial k(x,z)}{\partial x} \f$
	///
	/// first derivative of the ARD kernel wrt the first input pattern
	/// \f[ \frac{\partial k(x,z)}{\partial x_i} = -2 \gamma_i (x_i - z_i) \cdot k(x,z) \f]
	void weightedInputDerivative(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficientsX2,
		State const& state,
		BatchInputType& gradient
	) const{
		SIZE_CHECK(batchX1.size2() == batchX2.size2());
		SIZE_CHECK(batchX1.size2() == m_inputDimensions);

		std::size_t sizeX1 = batchX1.size1();
		std::size_t sizeX2 = batchX2.size1();
		InternalState const& s = state.toState<InternalState>();
		ensure_size(gradient, sizeX1, m_inputDimensions);
		gradient.clear();

		for(std::size_t i = 0; i != sizeX1; ++i){
			for(std::size_t j = 0; j != sizeX2; ++j){
				double coeff = coefficientsX2(i, j) * s.kxy(i, j);
				row(gradient, i) += coeff * m_gammas * (row(batchX1, i) - row(batchX2, j));
			}
		}
		gradient *= -2.0;
	}

	void read(InArchive& ar){
		ar >> m_gammas;
		ar >> m_inputDimensions;
	}

	void write(OutArchive& ar) const{
		ar << m_gammas;
		ar << m_inputDimensions;
	}

protected:
	RealVector m_gammas;           ///< kernel bandwidth parameters, one for each input dimension.
	std::size_t m_inputDimensions; ///< how many input dimensions = how many bandwidth parameters
};

typedef ARDKernelUnconstrained<> DenseARDKernel;
typedef ARDKernelUnconstrained<CompressedRealVector> CompressedARDKernel;

}
#endif
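
// Illustrative usage sketch (comment only; batchX1, batchX2, coefficients and
// dim are hypothetical placeholders, not defined in this header): batch
// evaluation with a state object, followed by the weighted parameter
// derivative that reuses the cached kernel values.
//
//   shark::DenseARDKernel kernel(dim);                 // gamma_i = 1 for all i
//   boost::shared_ptr<shark::State> state = kernel.createState();
//   shark::RealMatrix K;
//   kernel.eval(batchX1, batchX2, K, *state);          // Gram matrix, caches k(x,z)
//   shark::RealVector grad;
//   kernel.weightedParameterDerivative(batchX1, batchX2, coefficients, *state, grad);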