//===========================================================================
/*!
*
*
* \brief Evidence for model selection of a regularization network/Gaussian process.
*
*
* \author C. Igel, T. Glasmachers, O. Krause
* \date 2007-2012
*
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Shark. If not, see <http://www.gnu.org/licenses/>.
*
*/
//===========================================================================
#ifndef SHARK_OBJECTIVEFUNCTIONS_NEGATIVEGAUSSIANPROCESSEVIDENCE_H
#define SHARK_OBJECTIVEFUNCTIONS_NEGATIVEGAUSSIANPROCESSEVIDENCE_H
#include <shark/Data/Dataset.h>
#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/Models/Kernels/KernelHelpers.h>
namespace shark {
///
/// \brief Evidence for model selection of a regularization network/Gaussian process.
///
/// Let \f$M\f$ denote the (kernel Gram) covariance matrix and
/// \f$t\f$ the corresponding label vector. For the evidence we have:
/// \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi)] \f]
///
/// The evidence is also known as marginal (log)likelihood. For
/// details, please see:
///
/// C.E. Rasmussen & C.K.I. Williams, Gaussian
/// Processes for Machine Learning, section 5.4, MIT Press, 2006
///
/// C.M. Bishop, Pattern Recognition and Machine Learning, section
/// 6.4.3, Springer, 2006
///
/// The regularization parameter can be encoded in different ways.
/// The exponential encoding is the proper choice for unconstrained optimization.
/// Be careful not to mix up different encodings between trainer and evidence.
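///
/// A minimal usage sketch (the GaussianRbfKernel, the dataset \c data, and the
/// parameter values below are illustrative assumptions, not part of this header):
/// \code
/// GaussianRbfKernel<> kernel(0.1);                                  // kernel with a single parameter
/// NegativeGaussianProcessEvidence<> evidence(data, &kernel, true);  // exponential encoding of the noise term
/// RealVector point(evidence.numberOfVariables(), 0.0);              // kernel parameters first, encoded noise term last
/// double negLogEvidence = evidence.eval(point);                     // negative log evidence at this point
/// \endcode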
template<class InputType = RealVector, class LabelType = RealVector>
class NegativeGaussianProcessEvidence : public AbstractObjectiveFunction< RealVector, double >
{
public:
typedef LabeledData<InputType, LabelType> DatasetType;
typedef AbstractKernelFunction<InputType> KernelType;
/// \param dataset: training data for the Gaussian process
/// \param kernel: pointer to external kernel function
/// \param unconstrained: exponential encoding of the regularization parameter for unconstrained optimization
NegativeGaussianProcessEvidence(
DatasetType const& dataset,
KernelType* kernel,
bool unconstrained = false
): m_dataset(dataset)
, mep_kernel(kernel)
, m_unconstrained(unconstrained)
{
if (kernel->hasFirstParameterDerivative()) m_features |= HAS_FIRST_DERIVATIVE;
setThreshold(0.);
}
/// \brief From INameable: return the class name.
std::string name() const
{ return "NegativeGaussianProcessEvidence"; }
std::size_t numberOfVariables()const{
return 1+ mep_kernel->numberOfParameters();
}
/// Let \f$M\f$ denote the (kernel Gram) covariance matrix and
/// \f$t\f$ the label vector. For the evidence we have: \f[ E= 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
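/// \return the *negative* evidence \f$-E\f$, so that minimizing this objective maximizes the evidence.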
double eval(const RealVector& parameters) const {
std::size_t N = m_dataset.numberOfElements();
std::size_t kp = mep_kernel->numberOfParameters();
// check whether argument has right dimensionality
SHARK_ASSERT(1+kp == parameters.size());
// keep track of how often the objective function is called
m_evaluationCounter++;
//set parameters
double betaInv = parameters.back();
if(m_unconstrained)
betaInv = std::exp(betaInv); // for unconstrained optimization
mep_kernel->setParameterVector(subrange(parameters,0,kp));
//generate kernel matrix and label vector
RealMatrix M = calculateRegularizedKernelMatrix(*mep_kernel,m_dataset.inputs(),betaInv);
RealMatrix t = createBatch<RealVector>(m_dataset.labels().elements());
blas::cholesky_decomposition<RealMatrix> cholesky(M);
//compute the determinant of M using the cholesky factorization M=AA^T:
//ln det(M) = 2 trace(ln A)
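//(det(M) = det(A)^2 = (prod_i A_ii)^2, hence ln det(M) = 2 sum_i ln A_ii, which equals 2 trace(ln A)
// because only the diagonal of the elementwise log of the triangular factor enters the trace)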
double logDet = 2* trace(log(cholesky.lower_factor()));
//we need to compute t^T M^-1 t
//= t^T (AA^T)^-1 t = t^T (A^-T A^-1) t = ||A^-1 t||^2
//so we will first solve the triangular system Az=t
//and then compute ||z||^2
RealMatrix z = solve(cholesky.lower_factor(),t,blas::lower(), blas::left());
// equation (6.69) on page 311 in the book C.M. Bishop, Pattern Recognition and Machine Learning, Springer, 2006
// e = 1/2 \cdot [ -log(det(M)) - t^T M^{-1} t - N log(2 \pi) ]
double e = 0.5 * (-logDet - norm_sqr(to_vector(z)) - N * std::log(2.0 * M_PI));
// return the *negative* evidence
return -e;
}
/// Let \f$M\f$ denote the regularized (kernel Gram) covariance matrix.
/// For the evidence we have:
/// \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
/// For a kernel parameter \f$p\f$ and \f$C = \beta^{-1}\f$ we get the derivatives:
/// \f[ dE/dC = 1/2 \cdot [ -tr(M^{-1}) + \|M^{-1} t\|^2 ] \f]
/// \f[ dE/dp = 1/2 \cdot [ -tr(M^{-1} dM/dp) + t^T (M^{-1} dM/dp M^{-1}) t ] \f]
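///
/// A sketch of the call (continuing the usage example in the class documentation;
/// \c evidence and \c point are assumptions from that sketch):
/// \code
/// RealVector gradient;
/// double negLogEvidence = evidence.evalDerivative(point, gradient);
/// // gradient(0),...,gradient(kp-1): kernel parameter derivatives; gradient(kp): regularization term derivative
/// \endcode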
double evalDerivative(const RealVector& parameters, FirstOrderDerivative& derivative) const {
std::size_t N = m_dataset.numberOfElements();
std::size_t kp = mep_kernel->numberOfParameters();
// check whether argument has right dimensionality
SHARK_ASSERT(1 + kp == parameters.size());
derivative.resize(1 + kp);
// keep track of how often the objective function is called
m_evaluationCounter++;
//set parameters
double betaInv = parameters.back();
if(m_unconstrained)
betaInv = std::exp(betaInv); // for unconstrained optimization
mep_kernel->setParameterVector(subrange(parameters,0,kp));
//generate kernel matrix and label vector
RealMatrix M = calculateRegularizedKernelMatrix(*mep_kernel,m_dataset.inputs(),betaInv);
RealMatrix t = createBatch<RealVector>(m_dataset.labels().elements());
//compute cholesky decomposition of M
blas::cholesky_decomposition<RealMatrix> cholesky(M);
//we don't need M anymore, so free its memory
M=RealMatrix();
// compute derivative w.r.t. kernel parameters
//the derivative is defined as:
//dE/da = -tr(IM dM/da) +t^T IM dM/da IM t
// where IM is the inverse matrix of M, tr is the trace and a are the parameters of the kernel
//by substituting z = IM t we can expand the operations to:
//dE/da = -(sum_i sum_j IM_ij * dM_ji/da)+(sum_i sum_j dM_ij/da *z_i * z_j)
// = sum_i sum_j (-IM_ij+z_i * z_j) * dM_ij/da
// with W = -IM + zz^T we get
// dE/da = sum_i sum_j W_ij * dM_ij/da
//this can be calculated as blockwise derivative.
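//(calculateKernelMatrixParameterDerivative below accumulates sum_i sum_j W_ij * dM_ij/da batch by batch)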
//compute inverse matrix from the cholesky decomposition
RealMatrix W = blas::identity_matrix<double>(N);
cholesky.solve(W,blas::left());
//calculate z = Wt=M^-1 t
RealMatrix z = prod(W,t);
// W currently holds the inverse of M, so we only need
// to flip the sign and add zz^T to obtain W = -M^-1 + zz^T
W*=-1;
noalias(W) += prod(z,trans(z));
//now calculate the derivative
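//(the factor 0.5 accounts for the 1/2 in front of the evidence)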
RealVector kernelGradient = 0.5*calculateKernelMatrixParameterDerivative(*mep_kernel,m_dataset.inputs(),W);
// compute derivative w.r.t. regularization parameter
//we have: dE/dC = 1/2 * [ -tr(M^{-1}) + ||M^{-1} t||^2 ]
// which can also be written as 1/2 tr(W)
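//(since M = K + C*I, dM/dC = I, so sum_i sum_j W_ij * dM_ij/dC collapses to tr(W))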
double betaInvDerivative = 0.5 * trace(W) ;
if(m_unconstrained)
betaInvDerivative *= betaInv;
//merge both derivatives and since we return the negative evidence, multiply with -1
noalias(derivative) = - (kernelGradient | betaInvDerivative);
// truncate gradient vector
for(std::size_t i=0; i<derivative.size(); i++)
if(std::abs(derivative(i)) < m_derivativeThresholds(i)) derivative(i) = 0;
// compute the evidence as in eval(), reusing the cholesky factor and z = M^-1 t
double logDet = 2* trace(log(cholesky.lower_factor()));
double e = 0.5 * (-logDet - inner_prod(to_vector(t),to_vector(z)) - N * std::log(2.0 * M_PI));
// return the *negative* evidence
return -e;
}
/// set threshold value for truncating partial derivatives
void setThreshold(double d) {
m_derivativeThresholds = RealVector(mep_kernel->numberOfParameters() + 1, d); // plus one parameter for the prior
}
/// set threshold values for truncating partial derivatives
void setThresholds(RealVector &c) {
SHARK_ASSERT(m_derivativeThresholds.size() == c.size());
m_derivativeThresholds = c;
}
private:
/// the training data set
DatasetType m_dataset;
/// thresholds for setting derivatives to zero
RealVector m_derivativeThresholds;
/// pointer to external kernel function
KernelType* mep_kernel;
/// Indicates whether the log() of the regularization parameter is
/// used as the search variable. This is useful for unconstrained
/// optimization. The default value is false.
bool m_unconstrained;
};
}
#endif