//===========================================================================
/*!
 *
 * \brief Derivative of a C-SVM hypothesis w.r.t. its hyperparameters.
 *
 * \par
 * This class provides two main member functions for computing the
 * derivative of a C-SVM hypothesis w.r.t. its hyperparameters. First,
 * the derivative is prepared in general. Then, the derivative can be
 * computed comparatively cheaply for any input sample. Needs to be
 * supplied with pointers to a KernelExpansion and a CSvmTrainer.
 *
 * \author M. Tuma, T. Glasmachers
 * \date 2007-2012
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================
#ifndef SHARK_MODELS_CSVMDERIVATIVE_H
#define SHARK_MODELS_CSVMDERIVATIVE_H
#include <shark/Core/INameable.h>
#include <shark/Core/ISerializable.h>
#include <shark/Models/Kernels/KernelExpansion.h>
#include <shark/Algorithms/Trainers/CSvmTrainer.h>
namespace shark {
/// \brief Derivative of a C-SVM hypothesis w.r.t. its hyperparameters.
///
/// This class provides two main member functions for computing the
/// derivative of a C-SVM hypothesis w.r.t. its hyperparameters.
/// The constructor takes a pointer to a KernelClassifier and a CSvmTrainer,
/// under the assumption that the former was trained by the latter. It heavily
/// accesses their members to calculate the derivative of the alpha and offset
/// values w.r.t. the SVM hyperparameters, that is, the regularization
/// parameter C and the kernel parameters. This is done in the member function
/// prepareCSvmParameterDerivative called by the constructor. After this initial,
/// heavier computation step, modelCSvmParameterDerivative can be called on an
/// input sample to the SVM model, and the method will yield the derivative of
/// the hypothesis w.r.t. the SVM hyperparameters.
///
/// \tparam InputType Input type of the model; must be the same as the kernel expansion's basis type.
/// \tparam CacheType While the basic cache type defaults to float in the QP algorithms, it here defaults to double, because the SVM derivative benefits a lot from higher precision.
///
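/// \par
/// Usage sketch (the kernel choice, the dataset name, and the constructor
/// arguments below are illustrative assumptions, not requirements of this class):
/// \code
/// GaussianRbfKernel<> kernel( 0.5 );
/// KernelClassifier<RealVector> svm;
/// CSvmTrainer<RealVector> trainer( &kernel, 1.0, true ); //C-SVM with offset, C = 1.0
/// trainer.train( svm, data );                            //data: a binary ClassificationDataset, assumed given
/// CSvmDerivative<RealVector> svmDeriv( &svm, &trainer ); //prepares the derivative (the heavy step)
/// RealVector der;
/// svmDeriv.modelCSvmParameterDerivative( data.element(0).input, der ); //the cheap per-sample step
/// //der now holds d(score)/d(kernel parameters), followed by d(score)/dC as last entry
/// \endcode
///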
template< class InputType, class CacheType = double >
class CSvmDerivative : public ISerializable, public INameable
{
public:
typedef CacheType QpFloatType;
typedef KernelClassifier<InputType> KeType;
typedef AbstractKernelFunction<InputType> KernelType;
typedef CSvmTrainer<InputType, QpFloatType> TrainerType;
protected:
// key external members through which main information is obtained
KernelExpansion<InputType>* mep_ke; ///< pointer to the KernelExpansion, which must have been trained by the SvmTrainer below
TrainerType* mep_tr; ///< pointer to the SvmTrainer with which the above KernelExpansion was trained
KernelType* mep_k; ///< convenience pointer to the underlying kernel function
RealMatrix& m_alpha; ///< convenience reference to the alpha values of the KernelExpansion
const Data<InputType>& m_basis; ///< convenience reference to the underlying data of the KernelExpansion
const RealVector& m_db_dParams_from_solver; ///< convenience access to the correction term from the solver, for the rare case that there are no free SVs
// convenience copies from the CSvmTrainer and the underlying kernel function
double m_C; ///< the regularization parameter value with which the SvmTrainer trained the KernelExpansion
bool m_unconstrained; ///< is the unconstrained flag of the SvmTrainer set? Influences the derivatives!
std::size_t m_nkp; ///< convenience member holding the Number of Kernel Parameters
std::size_t m_nhp; ///< convenience member holding the Number of Hyper Parameters (kernel parameters plus C)
// information calculated from the KernelExpansion state in the prepareDerivative-step
std::size_t m_noofFreeSVs; ///< number of free SVs
std::size_t m_noofBoundedSVs; ///< number of bounded SVs
std::vector< std::size_t > m_freeAlphaIndices; ///< indices of free SVs
std::vector< std::size_t > m_boundedAlphaIndices; ///< indices of bounded SVs
RealVector m_freeAlphas; ///< alpha values of the free SVs, with the offset b appended as last entry
RealVector m_boundedAlphas; ///< alpha values of the bounded SVs
RealVector m_boundedLabels; ///< labels of the bounded SVs
/// Main member and result, computed in the prepareDerivative-step:
/// Stores the derivative of the **free** alphas w.r.t. SVM hyperparameters as obtained
/// through the CSvmTrainer (for C) and through the kernel (for the kernel parameters).
/// Each row corresponds to one **free** alpha, each column to one hyperparameter.
/// The **last** column is the derivative of (free_alphas, b) w.r.t C. All **previous**
/// columns are w.r.t. the kernel parameters.
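/// For nkp kernel parameters and f free SVs, the layout is:
///   ( d alpha_1/d k_1  ...  d alpha_1/d k_nkp   d alpha_1/d C )
///   (       ...                    ...                ...     )
///   ( d alpha_f/d k_1  ...  d alpha_f/d k_nkp   d alpha_f/d C )
///   (     d b/d k_1    ...      d b/d k_nkp        d b/d C    )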
RealMatrix m_d_alphab_d_theta;
public:
/// Constructor. Only sets up the main pointers and references to the external instances and data, and
/// performs basic sanity checks.
/// \param ke pointer to the KernelClassifier whose decision function was trained by the SvmTrainer below
/// \param trainer pointer to the SvmTrainer with which the above classifier was trained
CSvmDerivative( KeType* ke, TrainerType* trainer )
: mep_ke( &ke->decisionFunction() ),
mep_tr( trainer ),
mep_k( trainer->kernel() ),
m_alpha( mep_ke->alpha() ),
m_basis( mep_ke->basis() ),
m_db_dParams_from_solver( trainer->get_db_dParams() ),
m_C ( trainer->C() ),
m_unconstrained( trainer->isUnconstrained() ),
m_nkp( trainer->kernel()->numberOfParameters() ),
m_nhp( trainer->kernel()->numberOfParameters()+1 )
{
SHARK_RUNTIME_CHECK( mep_ke != NULL, "[CSvmDerivative::CSvmDerivative] KernelExpansion cannot be NULL.");
SHARK_RUNTIME_CHECK( mep_ke->kernel() == trainer->kernel(), "[CSvmDerivative::CSvmDerivative] KernelExpansion and SvmTrainer must use the same KernelFunction.");
SHARK_RUNTIME_CHECK( mep_ke->outputShape().numElements() == 1, "[CSvmDerivative::CSvmDerivative] only defined for binary SVMs.");
SHARK_RUNTIME_CHECK( mep_ke->hasOffset(), "[CSvmDerivative::CSvmDerivative] only defined for SVMs with offset.");
SHARK_RUNTIME_CHECK( m_alpha.size2() == 1, "[CSvmDerivative::CSvmDerivative] this class is only defined for binary SVMs.");
prepareCSvmParameterDerivative(); //main
}
/// \brief From INameable: return the class name.
std::string name() const
{ return "CSvmDerivative"; }
inline const KernelExpansion<InputType>* ke() { return mep_ke; }
inline const TrainerType* trainer() { return mep_tr; }
//! Computes the derivative of the model w.r.t. regularization and kernel parameters.
//! The preparatory step, prepareCSvmParameterDerivative, is carried out by the constructor,
//! so the SVM must have been trained before this class is instantiated.
//! \param input an example to be scored by the SVM
//! \param derivative a vector of derivatives of the score. The last entry is w.r.t. C.
void modelCSvmParameterDerivative(const InputType& input, RealVector& derivative )
{
// create temporary batch helpers
RealMatrix unit_weights(1,1,1.0);
RealMatrix bof_results(1,1);
typename Batch<InputType>::type bof_xi = Batch<InputType>::createBatch(input,1);
typename Batch<InputType>::type bof_input = Batch<InputType>::createBatch(input,1);
getBatchElement(bof_input, 0) = input; //fixed over entire function scope
// init helpers
RealVector der( m_nhp );
boost::shared_ptr<State> state = mep_k->createState(); //state from eval and for derivatives
derivative.resize( m_nhp );
// start calculating derivative
noalias(derivative) = row(m_d_alphab_d_theta,m_noofFreeSVs); //without much thinking, we add db/d(\theta) to all derivatives
// first: go through free SVs and add their contributions (the actual ones, which use the matrix d_alphab_d_theta)
for ( std::size_t i=0; i<m_noofFreeSVs; i++ ) {
	getBatchElement(bof_xi, 0) = m_basis.element(m_freeAlphaIndices[i]); //set the current free SV as second kernel argument
	mep_k->eval( bof_input, bof_xi, bof_results, *state );
	double ker = bof_results(0,0);
	double cur_alpha = m_freeAlphas(i);
	mep_k->weightedParameterDerivative( bof_input, bof_xi, unit_weights, *state, der );
	noalias(derivative) += ker * row(m_d_alphab_d_theta,i); //for C, simply add up the individual contributions
	noalias(subrange(derivative,0,m_nkp)) += cur_alpha*der;
}
// second: go through all bounded SVs and add their "trivial" derivative contributions
for ( std::size_t i=0; i<m_noofBoundedSVs; i++ ) {
	getBatchElement(bof_xi, 0) = m_basis.element(m_boundedAlphaIndices[i]); //set the current bounded SV as second kernel argument
	mep_k->eval( bof_input, bof_xi, bof_results, *state );
	double ker = bof_results(0,0);
	double cur_label = m_boundedLabels(i);
	mep_k->weightedParameterDerivative( bof_input, bof_xi, unit_weights, *state, der );
	derivative( m_nkp ) += ker * cur_label; //deriv of bounded alpha w.r.t. C is simply the label
	noalias(subrange(derivative,0,m_nkp)) += cur_label * m_C * der;
}
if ( m_unconstrained )
derivative( m_nkp ) *= m_C; //compensate for log encoding via chain rule
//(the kernel parameter derivatives are correctly differentiated according to their
// respective encoding via the kernel's derivative, so we don't need to correct for those.)
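// (concretely, for C: with the log encoding C = exp(\theta_C), the chain rule gives
//  d score/d\theta_C = (d score/dC) * dC/d\theta_C = (d score/dC) * C)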
// in some rare cases, there are no free SVs and we have to manually correct the derivatives using a correcting term from the SvmTrainer.
if ( m_noofFreeSVs == 0 ) {
noalias(derivative) += m_db_dParams_from_solver;
}
}
//! Whether there are free SVs in the solution. Useful to monitor for degenerate solutions
//! with only bounded and no free SVs. The preparatory computation is carried out by the
//! constructor, so the SVM must have been trained before this class is instantiated.
bool hasFreeSVs() { return ( m_noofFreeSVs != 0 ); }
//! Whether there are bounded SVs in the solution. Useful to monitor for degenerate solutions
//! with only bounded and no free SVs. The same note as for hasFreeSVs() applies.
bool hasBoundedSVs() { return ( m_noofBoundedSVs != 0 ); }
/// Access to the matrix of the SVM coefficients' derivatives. The last column holds the derivative w.r.t. C.
const RealMatrix& get_dalphab_dtheta() {
return m_d_alphab_d_theta;
}
/// From ISerializable, reads a network from an archive
virtual void read( InArchive & archive ) {
}
/// From ISerializable, writes a network to an archive
virtual void write( OutArchive & archive ) const {
}
private:
/////////// DERIVATIVE OF BINARY (C-)SVM ////////////////////
//! Fill m_d_alphab_d_theta, the matrix of derivatives of free SVs w.r.t. C-SVM hyperparameters
//! as obtained through the CSvmTrainer (for C) and through the kernel (for the kernel parameters).
//! \par
//! Note: we follow the alpha-encoding conventions of Glasmachers' dissertation, where the alpha values
//! are re-encoded by multiplying each with the corresponding label.
//!
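//! For example, a bounded SV of the positive class then carries alpha = +C, and one of the
//! negative class alpha = -C, so that the label can be recovered as the sign of alpha.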
void prepareCSvmParameterDerivative() {
// init convenience size indicators
std::size_t numberOfAlphas = m_alpha.size1();
// first round through alphas: count free and bounded SVs
for ( std::size_t i=0; i<numberOfAlphas; i++ ) {
	double cur_alpha = m_alpha( i, 0 ); //safe: the constructor checked that this is a binary SVM
	if ( cur_alpha != 0.0 ) {
		if ( cur_alpha == m_C || cur_alpha == -m_C ) //the label-re-encoded alpha convention is assumed
			m_boundedAlphaIndices.push_back(i);
		else
			m_freeAlphaIndices.push_back(i);
	}
}
m_noofFreeSVs = m_freeAlphaIndices.size();
m_noofBoundedSVs = m_boundedAlphaIndices.size();
// allocate the result matrix: one row per free alpha plus one for b, one column per hyperparameter
m_d_alphab_d_theta.resize( m_noofFreeSVs+1, m_nhp );
m_d_alphab_d_theta.clear();
// second round through the alphas: copy the free alphas (and b) into one vector,
// the bounded alphas and the corresponding labels into two others
m_freeAlphas.resize( m_noofFreeSVs+1 );
m_boundedAlphas.resize( m_noofBoundedSVs );
m_boundedLabels.resize( m_noofBoundedSVs );
for ( std::size_t i=0; i<m_noofFreeSVs; i++ )
	m_freeAlphas(i) = m_alpha( m_freeAlphaIndices[i], 0 );
m_freeAlphas( m_noofFreeSVs ) = mep_ke->offset(0);
for ( std::size_t i=0; i<m_noofBoundedSVs; i++ ) {
	m_boundedAlphas(i) = m_alpha( m_boundedAlphaIndices[i], 0 );
	m_boundedLabels(i) = ( (m_boundedAlphas(i) > 0.0) ? 1.0 : -1.0 ); //recover the label from the sign of the re-encoded alpha
}
//if there are no free support vectors, we are done.
if ( m_noofFreeSVs == 0 ) {
return;
}
// set up helper variables.
// See Tobias Glasmacher's dissertation, chapter 9.3, for a calculation of the derivatives as well as
// for a definition of these variables. -> It's very easy to follow this code with that chapter open.
// The Keerthi-paper "Efficient method for gradient-based..." is also highly recommended for cross-reference.
RealVector der( m_nkp ); //derivative storage helper
boost::shared_ptr<State> state = mep_k->createState(); //state object for derivatives
// create temporary batch helpers
RealMatrix unit_weights(1,1,1.0);
RealMatrix bof_results(1,1);
typename Batch<InputType>::type bof_xi;
typename Batch<InputType>::type bof_xj;
if ( m_noofFreeSVs != 0 ) {
	bof_xi = Batch<InputType>::createBatch( m_basis.element(m_freeAlphaIndices[0]), 1 ); //any input works
	bof_xj = Batch<InputType>::createBatch( m_basis.element(m_freeAlphaIndices[0]), 1 ); //any input works
} else if ( m_noofBoundedSVs != 0 ) {
	bof_xi = Batch<InputType>::createBatch( m_basis.element(m_boundedAlphaIndices[0]), 1 ); //any input works
	bof_xj = Batch<InputType>::createBatch( m_basis.element(m_boundedAlphaIndices[0]), 1 ); //any input works
} else {
	throw SHARKEXCEPTION("[CSvmDerivative::prepareCSvmParameterDerivative] Something went very wrong.");
}
// initialize H and dH
// H is the bordered matrix
//   H = (K 1; 1 0),
// where the ones denote a column/row vector of ones, 0 is a scalar,
// and K is the kernel matrix spanned by the free support vectors.
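// In this notation, the KKT stationarity conditions of the free SVs read
//   H * (\alpha_f, b) + R * \alpha_g = (y_f, 0),
// where R (defined below) holds the kernel values between free and bounded SVs.
// Differentiating w.r.t. a kernel parameter \theta_k yields the linear system
//   H * d(\alpha_f, b)/d\theta_k = -( dH_k * (\alpha_f, b) + dR_k * \alpha_g ),
// and differentiating w.r.t. C (where \alpha_g = C*y_g and dH/dC = dR/dC = 0) yields
//   H * d(\alpha_f, b)/dC = -R * y_g.
// The code below assembles all right-hand sides column-wise in m_d_alphab_d_theta
// and solves for all of them at once in the final solveH step.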
RealMatrix H( m_noofFreeSVs + 1, m_noofFreeSVs + 1,0.0);
std::vector< RealMatrix > dH( m_nkp , RealMatrix(m_noofFreeSVs+1, m_noofFreeSVs+1));
for ( std::size_t i=0; i<m_noofFreeSVs; i++ ) {
	getBatchElement(bof_xi, 0) = m_basis.element(m_freeAlphaIndices[i]); //fixed over one row of the matrix
	for ( std::size_t j=0; j<i; j++ ) {
		getBatchElement(bof_xj, 0) = m_basis.element(m_freeAlphaIndices[j]);
		mep_k->eval( bof_xi, bof_xj, bof_results, *state );
		H( i,j ) = H( j,i ) = bof_results(0,0);
		mep_k->weightedParameterDerivative( bof_xi, bof_xj, unit_weights, *state, der );
		for ( std::size_t k=0; k<m_nkp; k++ )
			dH[k]( i,j ) = dH[k]( j,i ) = der(k);
	}
	// diagonal entry and border
	mep_k->eval( bof_xi, bof_xi, bof_results, *state );
	H( i,i ) = bof_results(0,0);
	H( i, m_noofFreeSVs ) = H( m_noofFreeSVs, i ) = 1.0;
	mep_k->weightedParameterDerivative( bof_xi, bof_xi, unit_weights, *state, der );
	for ( std::size_t k=0; k<m_nkp; k++ ) {
		dH[k]( i,i ) = der(k);
		dH[k]( i, m_noofFreeSVs ) = dH[k]( m_noofFreeSVs, i ) = 0.0;
	}
}
// initialize R and dR
// R is the matrix of kernel evaluations between free and bounded SVs,
// with a row of ones (the border of H) appended at the bottom
RealMatrix R( m_noofFreeSVs+1, m_noofBoundedSVs );
std::vector< RealMatrix > dR( m_nkp, RealMatrix(m_noofFreeSVs+1, m_noofBoundedSVs));
for ( std::size_t i=0; i<m_noofBoundedSVs; i++ ) {
	getBatchElement(bof_xi, 0) = m_basis.element(m_boundedAlphaIndices[i]); //fixed over one column of the matrix
	for ( std::size_t j=0; j<m_noofFreeSVs; j++ ) {
		getBatchElement(bof_xj, 0) = m_basis.element(m_freeAlphaIndices[j]);
		mep_k->eval( bof_xi, bof_xj, bof_results, *state );
		R( j,i ) = bof_results(0,0);
		mep_k->weightedParameterDerivative( bof_xi, bof_xj, unit_weights, *state, der );
		for ( std::size_t k=0; k<m_nkp; k++ )
			dR[k]( j,i ) = der(k);
	}
	R( m_noofFreeSVs, i ) = 1.0; //last row of R corresponds to b
	for ( std::size_t k=0; k<m_nkp; k++ )
		dR[k]( m_noofFreeSVs, i ) = 0.0;
}
// compute the derivative of (\alpha, b) w.r.t. C: since the bounded alphas equal C*y_g,
// their derivative w.r.t. C is simply the label vector
if ( m_noofBoundedSVs > 0 ) {
	noalias(column(m_d_alphab_d_theta,m_nkp)) = prod( R, m_boundedLabels );
}
// compute the derivative of (\alpha, b) w.r.t. the kernel parameters
for ( std::size_t k=0; k<m_nkp; k++ ) {
	RealVector sum = prod( dH[k], m_freeAlphas ); // sum = dH_k * (\alpha_f, b)
	if ( m_noofBoundedSVs > 0 )
		noalias(sum) += prod( dR[k], m_boundedAlphas ); // sum += dR_k * \alpha_g , i.e., the C*y_g is expressed as alpha_g
	//fill the k-th column of the derivative matrix (the last column is reserved for C)
	noalias(column(m_d_alphab_d_theta,k)) = sum;
}
//lastly, solve the system H * x_k = rhs_k for each column rhs_k of the matrix assembled above
// MAJOR STEP: this is the Achilles heel of the current implementation, cf. Keerthi et al. 2007
// TODO: mtq: explore ways for speed-up..
//compute via Moore-Penrose pseudo-inverse
noalias(m_d_alphab_d_theta) = - solveH(H,m_d_alphab_d_theta);
// that's all, folks; we're done.
}
RealMatrix solveH(RealMatrix const& H, RealMatrix const& rhs){
	// solve via the Moore-Penrose pseudo-inverse: x = (H^T H)^{-1} H^T rhs.
	// Since H is symmetric, prod(H,rhs) equals prod(trans(H),rhs). The normal-equations
	// matrix HTH is symmetric positive semi-definite, which the solver exploits; this
	// squares the condition number of H, but copes with singular H in degenerate cases.
	RealMatrix HTH = prod(trans(H),H);
	RealMatrix result = solve(HTH, prod(H,rhs), blas::symm_semi_pos_def(), blas::left());
	return result;
}
};//class
}//namespace
#endif