//===========================================================================
/*!
 *
 * \brief Derivative of a C-SVM hypothesis w.r.t. its hyperparameters.
 *
 * \par
 * This class provides two main member functions for computing the
 * derivative of a C-SVM hypothesis w.r.t. its hyperparameters. First,
 * the derivative is prepared in general. Then, the derivative can be
 * computed comparatively cheaply for any input sample. Needs to be
 * supplied with pointers to a KernelExpansion and CSvmTrainer.
 *
 * \author M. Tuma, T. Glasmachers
 * \date 2007-2012
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================
#ifndef SHARK_MODELS_CSVMDERIVATIVE_H
#define SHARK_MODELS_CSVMDERIVATIVE_H

#include <shark/Core/INameable.h>
#include <shark/Core/ISerializable.h>
#include <shark/Models/Kernels/KernelExpansion.h>
#include <shark/Algorithms/Trainers/CSvmTrainer.h>

namespace shark {

/// \brief Derivative of a C-SVM hypothesis w.r.t. its hyperparameters.
///
/// This class provides two main member functions for computing the
/// derivative of a C-SVM hypothesis w.r.t. its hyperparameters. The
/// constructor takes a pointer to a KernelClassifier and an SvmTrainer,
/// under the assumption that the former was trained by the latter. It
/// heavily accesses their members to calculate the derivative of the
/// alpha and offset values w.r.t. the SVM hyperparameters, that is, the
/// regularization parameter C and the kernel parameters. This is done in
/// the member function prepareCSvmParameterDerivative, called by the
/// constructor. After this initial, heavier computation step,
/// modelCSvmParameterDerivative can be called on any input sample to the
/// SVM model, and it will yield the derivative of the hypothesis w.r.t.
/// the SVM hyperparameters.
///
/// \tparam InputType Same basis type as the kernel expansion's.
/// \tparam CacheType While the basic cache type defaults to float in the
///                   QP algorithms, it defaults to double here, because the
///                   SVM derivative benefits a lot from higher precision.
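/// \par Example
/// A minimal usage sketch (illustrative only: the data set, kernel choice, and
/// parameter values are placeholders, and setComputeBinaryDerivative is assumed
/// to be available on the trainer to record the correction term used below).
/// Note that the trainer's cache type must match this class' CacheType, i.e.,
/// double by default:
/// \code
/// ClassificationDataset data = ...;                    // some binary classification problem
/// GaussianRbfKernel<> kernel( 0.5 );
/// KernelClassifier<RealVector> svm;
/// CSvmTrainer<RealVector, double> trainer( &kernel, 1.0, true ); // C = 1.0, with offset
/// trainer.setComputeBinaryDerivative( true );
/// trainer.train( svm, data );
/// CSvmDerivative<RealVector> svmDeriv( &svm, &trainer );
/// RealVector derivative;
/// svmDeriv.modelCSvmParameterDerivative( data.element(0).input, derivative );
/// // derivative now holds df/d(kernel parameters), with df/dC in the last entry
/// \endcode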
template< class InputType, class CacheType = double >
class CSvmDerivative : public ISerializable, public INameable
{
public:
    typedef CacheType QpFloatType;
    typedef KernelClassifier<InputType> KeType;
    typedef AbstractKernelFunction<InputType> KernelType;
    typedef CSvmTrainer<InputType, QpFloatType> TrainerType;

protected:
    // key external members through which main information is obtained
    KernelExpansion<InputType>* mep_ke; ///< pointer to the KernelExpansion, which has to have been trained by the SvmTrainer below
    TrainerType* mep_tr; ///< pointer to the SvmTrainer with which the above KernelExpansion has to have been trained
    KernelType* mep_k; ///< convenience pointer to the underlying kernel function
    RealMatrix& m_alpha; ///< convenience reference to the alpha values of the KernelExpansion
    const Data<InputType>& m_basis; ///< convenience reference to the underlying data of the KernelExpansion
    const RealVector& m_db_dParams_from_solver; ///< convenience access to the correction term from the solver, for the rare case that there are no free SVs

    // convenience copies from the CSvmTrainer and the underlying kernel function
    double m_C; ///< the regularization parameter value with which the SvmTrainer trained the KernelExpansion
    bool m_unconstrained; ///< is the unconstrained flag of the SvmTrainer set? Influences the derivatives!
    std::size_t m_nkp; ///< convenience member holding the number of kernel parameters
    std::size_t m_nhp; ///< convenience member holding the number of hyperparameters

    // information calculated from the KernelExpansion state in the prepareDerivative step
    std::size_t m_noofFreeSVs; ///< number of free SVs
    std::size_t m_noofBoundedSVs; ///< number of bounded SVs
    std::vector< std::size_t > m_freeAlphaIndices; ///< indices of free SVs
    std::vector< std::size_t > m_boundedAlphaIndices; ///< indices of bounded SVs
    RealVector m_freeAlphas; ///< alpha values of the free SVs, with the offset b appended as last entry
    RealVector m_boundedAlphas; ///< alpha values of the bounded SVs
    RealVector m_boundedLabels; ///< labels of the bounded SVs

    /// Main member and result, computed in the prepareDerivative step:
    /// stores the derivative of the **free** alphas w.r.t. SVM hyperparameters
    /// as obtained through the CSvmTrainer (for C) and through the kernel (for
    /// the kernel parameters). Each row corresponds to one **free** alpha, each
    /// column to one hyperparameter. The **last** column is the derivative of
    /// (free_alphas, b) w.r.t. C. All **previous** columns are w.r.t. the
    /// kernel parameters.
    RealMatrix m_d_alphab_d_theta;

public:
    /// Constructor. Only sets up the main pointers and references to the external
    /// instances and data, and performs basic sanity checks.
    /// \param ke pointer to the KernelClassifier whose decision function has to have been trained by the SvmTrainer below
    /// \param trainer pointer to the SvmTrainer with which the above KernelClassifier has to have been trained
    CSvmDerivative( KeType* ke, TrainerType* trainer )
    : mep_ke( &ke->decisionFunction() )
    , mep_tr( trainer )
    , mep_k( trainer->kernel() )
    , m_alpha( mep_ke->alpha() )
    , m_basis( mep_ke->basis() )
    , m_db_dParams_from_solver( trainer->get_db_dParams() )
    , m_C( trainer->C() )
    , m_unconstrained( trainer->isUnconstrained() )
    , m_nkp( trainer->kernel()->numberOfParameters() )
    , m_nhp( trainer->kernel()->numberOfParameters()+1 )
    {
        SHARK_RUNTIME_CHECK( mep_ke != NULL, "[CSvmDerivative::CSvmDerivative] KernelExpansion cannot be NULL.");
        SHARK_RUNTIME_CHECK( mep_ke->kernel() == trainer->kernel(), "[CSvmDerivative::CSvmDerivative] KernelExpansion and SvmTrainer must use the same KernelFunction.");
        SHARK_RUNTIME_CHECK( mep_ke->outputShape().numElements() == 1, "[CSvmDerivative::CSvmDerivative] only defined for binary SVMs.");
        SHARK_RUNTIME_CHECK( mep_ke->hasOffset(), "[CSvmDerivative::CSvmDerivative] only defined for SVMs with offset.");
        SHARK_RUNTIME_CHECK( m_alpha.size2() == 1, "[CSvmDerivative::CSvmDerivative] only defined for binary SVMs.");
        prepareCSvmParameterDerivative(); //main preparation step
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "CSvmDerivative"; }

    inline const KernelExpansion<InputType>* ke() { return mep_ke; }
    inline const TrainerType* trainer() { return mep_tr; }
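    // Background for the computation below: the SVM hypothesis is
    //   f(x) = sum_i alpha_i * k(x, x_i) + b,
    // so for any hyperparameter theta, the product rule gives
    //   df/dtheta = sum_i [ d(alpha_i)/dtheta * k(x, x_i) + alpha_i * dk(x, x_i)/dtheta ] + db/dtheta.
    // For the free SVs, d(alpha_i)/dtheta and db/dtheta are rows of m_d_alphab_d_theta;
    // for the bounded SVs, alpha_i = +-C, so d(alpha_i)/dC is the label y_i, and the
    // derivative of alpha_i w.r.t. the kernel parameters is zero. This is exactly the
    // free/bounded split implemented in modelCSvmParameterDerivative.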
    //! Computes the derivative of the model w.r.t. the regularization and kernel parameters.
    //! The preparation step prepareCSvmParameterDerivative is carried out by the constructor,
    //! so this method can be called on any input right after construction.
    //! \param input an example to be scored by the SVM
    //! \param derivative the vector of derivatives of the score; the last entry is w.r.t. C.
    void modelCSvmParameterDerivative(const InputType& input, RealVector& derivative )
    {
        // create temporary batch helpers
        RealMatrix unit_weights(1,1,1.0);
        RealMatrix bof_results(1,1);
        typename Batch<InputType>::type bof_xi = Batch<InputType>::createBatch(input,1);
        typename Batch<InputType>::type bof_input = Batch<InputType>::createBatch(input,1);
        getBatchElement(bof_input, 0) = input; //fixed over entire function scope

        // init helpers
        RealVector der( m_nhp );
        boost::shared_ptr<State> state = mep_k->createState(); //state from eval and for derivatives
        derivative.resize( m_nhp );

        // start calculating the derivative: the last row of m_d_alphab_d_theta holds
        // db/d(\theta), which contributes to all derivatives
        noalias(derivative) = row(m_d_alphab_d_theta,m_noofFreeSVs);

        // first: go through the free SVs and add their contributions (the actual ones, which use the matrix d_alphab_d_theta)
        for ( std::size_t i=0; i<m_noofFreeSVs; i++ ) {
            getBatchElement(bof_xi, 0) = m_basis.element(m_freeAlphaIndices[i]);
            mep_k->eval( bof_input, bof_xi, bof_results, *state );
            double ker = bof_results(0,0);
            double cur_alpha = m_freeAlphas(i);
            mep_k->weightedParameterDerivative( bof_input, bof_xi, unit_weights, *state, der );
            noalias(derivative) += ker * row(m_d_alphab_d_theta,i); //for C, simply add up the individual contributions
            noalias(subrange(derivative,0,m_nkp)) += cur_alpha*der;
        }
        // second: go through all bounded SVs and add their "trivial" derivative contributions
        for ( std::size_t i=0; i<m_noofBoundedSVs; i++ ) {
            getBatchElement(bof_xi, 0) = m_basis.element(m_boundedAlphaIndices[i]);
            mep_k->eval( bof_input, bof_xi, bof_results, *state );
            double ker = bof_results(0,0);
            double cur_label = m_boundedLabels(i);
            mep_k->weightedParameterDerivative( bof_input, bof_xi, unit_weights, *state, der );
            derivative( m_nkp ) += ker * cur_label; //deriv of a bounded alpha w.r.t. C is simply the label
            noalias(subrange(derivative,0,m_nkp)) += cur_label * m_C * der;
        }
        if ( m_unconstrained )
            derivative( m_nkp ) *= m_C; //compensate for the log encoding via the chain rule
        //(the kernel parameter derivatives are correctly differentiated according to their
        // respective encoding via the kernel's derivative, so we don't need to correct for those.)

        // in some rare cases there are no free SVs, and we have to correct the derivatives
        // manually using the correction term from the SvmTrainer
        if ( m_noofFreeSVs == 0 ) {
            noalias(derivative) += m_db_dParams_from_solver;
        }
    }

    //! Whether there are free SVs in the solution. Useful to monitor for degenerate
    //! solutions with only bounded and no free SVs. The counts are determined during
    //! construction.
    bool hasFreeSVs() { return ( m_noofFreeSVs != 0 ); }

    //! Whether there are bounded SVs in the solution. Useful to monitor for degenerate
    //! solutions with only bounded and no free SVs. The counts are determined during
    //! construction.
    bool hasBoundedSVs() { return ( m_noofBoundedSVs != 0 ); }

    /// Access to the matrix of the SVM coefficients' derivatives. The derivative w.r.t. C is last.
    const RealMatrix& get_dalphab_dtheta() { return m_d_alphab_d_theta; }

    /// From ISerializable, reads a network from an archive.
    virtual void read( InArchive & archive ) { }

    /// From ISerializable, writes a network to an archive.
    virtual void write( OutArchive & archive ) const { }

private:

    ///////////  DERIVATIVE OF BINARY (C-)SVM  ////////////////////
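    // Sketch of the underlying derivation (cf. Glasmachers' dissertation, ch. 9.3):
    // on the free SVs the KKT conditions hold with equality, giving the system
    //   H * (alpha_f, b) + R * alpha_r = (y_f, 0),
    // with H and R as constructed below. Differentiating w.r.t. a hyperparameter
    // theta (on which the right-hand side does not depend) yields
    //   H * d(alpha_f, b)/dtheta = -( dH/dtheta * (alpha_f, b) + dR/dtheta * alpha_r ),
    // where d(alpha_r)/dC = y_r enters the C-column via R * y_r. Assembling the
    // right-hand sides and solving this system column by column is what
    // prepareCSvmParameterDerivative does.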
    //! Fill m_d_alphab_d_theta, the matrix of derivatives of the free SVs w.r.t. the C-SVM
    //! hyperparameters, as obtained through the CSvmTrainer (for C) and through the kernel
    //! (for the kernel parameters).
    //! \par
    //! Note: we follow the alpha-encoding conventions of Glasmachers' dissertation, where
    //! the alpha values are re-encoded by multiplying each with the corresponding label.
    void prepareCSvmParameterDerivative() {
        // init convenience size indicators
        std::size_t numberOfAlphas = m_alpha.size1();

        // first round through the alphas: count free and bounded SVs
        for ( std::size_t i=0; i<numberOfAlphas; i++ ) {
            double cur_alpha = m_alpha(i,0); //the constructor checked that there is only one column
            if ( cur_alpha != 0.0 ) {
                if ( cur_alpha == m_C || cur_alpha == -m_C ) //the alpha is bounded
                    m_boundedAlphaIndices.push_back(i);
                else //the alpha is free
                    m_freeAlphaIndices.push_back(i);
            }
        }
        m_noofFreeSVs = m_freeAlphaIndices.size();
        m_noofBoundedSVs = m_boundedAlphaIndices.size();

        // copy the free alphas (with the offset b appended) and the bounded alphas with their labels
        m_freeAlphas.resize( m_noofFreeSVs+1 );
        m_boundedAlphas.resize( m_noofBoundedSVs );
        m_boundedLabels.resize( m_noofBoundedSVs );
        for ( std::size_t i=0; i<m_noofFreeSVs; i++ )
            m_freeAlphas(i) = m_alpha( m_freeAlphaIndices[i], 0 );
        m_freeAlphas( m_noofFreeSVs ) = mep_ke->offset(0);
        for ( std::size_t i=0; i<m_noofBoundedSVs; i++ ) {
            double cur_alpha = m_alpha( m_boundedAlphaIndices[i], 0 );
            m_boundedAlphas(i) = cur_alpha;
            m_boundedLabels(i) = ( (cur_alpha > 0.0) ? 1.0 : -1.0 );
        }

        // set up the derivative matrix, initialized to zero
        m_d_alphab_d_theta = RealMatrix( m_noofFreeSVs+1, m_nhp, 0.0 );
        //if there are no free support vectors, we are done.
        if ( m_noofFreeSVs == 0 ) {
            return;
        }

        // set up helper variables.
        // See Tobias Glasmachers' dissertation, chapter 9.3, for a calculation of the derivatives
        // as well as for a definition of these variables. -> It's very easy to follow this code
        // with that chapter open. The Keerthi et al. paper "An Efficient Method for Gradient-Based
        // Adaptation of Hyperparameters in SVM Models" is also highly recommended for cross-reference.
        RealVector der( m_nkp ); //derivative storage helper
        boost::shared_ptr<State> state = mep_k->createState(); //state object for the derivatives

        // create temporary batch helpers
        RealMatrix unit_weights(1,1,1.0);
        RealMatrix bof_results(1,1);
        typename Batch<InputType>::type bof_xi;
        typename Batch<InputType>::type bof_xj;
        if ( m_noofFreeSVs != 0 ) {
            bof_xi = Batch<InputType>::createBatch( m_basis.element(m_freeAlphaIndices[0]), 1 ); //any input works
            bof_xj = Batch<InputType>::createBatch( m_basis.element(m_freeAlphaIndices[0]), 1 ); //any input works
        } else if ( m_noofBoundedSVs != 0 ) {
            bof_xi = Batch<InputType>::createBatch( m_basis.element(m_boundedAlphaIndices[0]), 1 ); //any input works
            bof_xj = Batch<InputType>::createBatch( m_basis.element(m_boundedAlphaIndices[0]), 1 ); //any input works
        } else {
            throw SHARKEXCEPTION("[CSvmDerivative::prepareCSvmParameterDerivative] Something went very wrong.");
        }

        // initialize H and dH
        // H is the matrix
        //   H = (K 1; 1 0)
        // where the ones are row or column vectors, 0 is a scalar, and
        // K is the kernel matrix spanned by the free support vectors.
        RealMatrix H( m_noofFreeSVs+1, m_noofFreeSVs+1, 0.0 );
        std::vector< RealMatrix > dH( m_nkp, RealMatrix(m_noofFreeSVs+1, m_noofFreeSVs+1) );
        for ( std::size_t i=0; i<m_noofFreeSVs; i++ ) {
            getBatchElement(bof_xi, 0) = m_basis.element(m_freeAlphaIndices[i]); //fixed over the inner loop
            // fill the off-diagonal entries of the kernel block..
            for ( std::size_t j=i+1; j<m_noofFreeSVs; j++ ) {
                getBatchElement(bof_xj, 0) = m_basis.element(m_freeAlphaIndices[j]);
                mep_k->eval( bof_xi, bof_xj, bof_results, *state );
                H( i,j ) = H( j,i ) = bof_results(0,0);
                mep_k->weightedParameterDerivative( bof_xi, bof_xj, unit_weights, *state, der );
                for ( std::size_t k=0; k<m_nkp; k++ )
                    dH[k]( i,j ) = dH[k]( j,i ) = der(k);
            }
            // ..then the diagonal entry and the border of ones
            mep_k->eval( bof_xi, bof_xi, bof_results, *state );
            H( i,i ) = bof_results(0,0);
            H( i, m_noofFreeSVs ) = H( m_noofFreeSVs, i ) = 1.0;
            mep_k->weightedParameterDerivative( bof_xi, bof_xi, unit_weights, *state, der );
            for ( std::size_t k=0; k<m_nkp; k++ ) {
                dH[k]( i,i ) = der(k);
                dH[k]( i, m_noofFreeSVs ) = dH[k]( m_noofFreeSVs, i ) = 0.0;
            }
        }
        for ( std::size_t k=0; k<m_nkp; k++ )
            dH[k]( m_noofFreeSVs, m_noofFreeSVs ) = 0.0;

        // initialize R and dR
        // R holds the kernel evaluations between bounded and free SVs,
        // with a row of ones appended for the offset b.
        RealMatrix R( m_noofFreeSVs+1, m_noofBoundedSVs );
        std::vector< RealMatrix > dR( m_nkp, RealMatrix(m_noofFreeSVs+1, m_noofBoundedSVs) );
        for ( std::size_t i=0; i<m_noofBoundedSVs; i++ ) {
            getBatchElement(bof_xi, 0) = m_basis.element(m_boundedAlphaIndices[i]); //fixed over the inner loop
            for ( std::size_t j=0; j<m_noofFreeSVs; j++ ) {
                getBatchElement(bof_xj, 0) = m_basis.element(m_freeAlphaIndices[j]);
                mep_k->eval( bof_xi, bof_xj, bof_results, *state );
                R( j,i ) = bof_results(0,0);
                mep_k->weightedParameterDerivative( bof_xi, bof_xj, unit_weights, *state, der );
                for ( std::size_t k=0; k<m_nkp; k++ )
                    dR[k]( j,i ) = der(k);
            }
            R( m_noofFreeSVs, i ) = 1.0; //last row of ones for the offset b
            for ( std::size_t k=0; k<m_nkp; k++ )
                dR[k]( m_noofFreeSVs, i ) = 0.0;
        }

        // compute the derivative of (\alpha, b) w.r.t. C
        if ( m_noofBoundedSVs > 0 ) {
            noalias(column(m_d_alphab_d_theta,m_nkp)) = prod( R, m_boundedLabels );
        }
        // compute the derivative of (\alpha, b) w.r.t. the kernel parameters
        for ( std::size_t k=0; k<m_nkp; k++ ) {
            RealVector sum = prod( dH[k], m_freeAlphas ); // sum = dH * (\alpha_f, b)
            if ( m_noofBoundedSVs > 0 )
                noalias(sum) += prod( dR[k], m_boundedAlphas ); // sum += dR * \alpha_r , i.e., the C*y_g is expressed as alpha_g
            //fill the remaining columns of the derivative matrix (except the last, which is for C)
            noalias(column(m_d_alphab_d_theta,k)) = sum;
        }
        // lastly, solve the system H x_i = -b_i for each assembled column b_i.
        // MAJOR STEP: this is the Achilles heel of the current implementation, cf. Keerthi et al. 2007.
        // TODO: mtq: explore ways for speed-up..
        // computed via the Moore-Penrose pseudo-inverse:
        noalias(m_d_alphab_d_theta) = -solveH( H, m_d_alphab_d_theta );
        // that's all, folks; we're done.
    }
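    // Helper: least-squares solve of H * X = B via the normal equations. Since H is
    // symmetric, solving (H^T H) X = H B corresponds to applying the Moore-Penrose
    // pseudo-inverse of H, which remains well-defined even when H is singular
    // (e.g., in the presence of duplicate support vectors).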
    RealMatrix solveH( RealMatrix const& H, RealMatrix const& rhs ) {
        //implemented using the Moore-Penrose pseudo-inverse
        RealMatrix HTH = prod( trans(H), H );
        RealMatrix result = solve( HTH, prod(H, rhs), blas::symm_semi_pos_def(), blas::left() );
        return result;
    }

};//class
}//namespace
#endif