/*!
 *
 * \brief      Implements the Squared Hinge Loss function for maximum margin classification.
 *
 *
 * \author     Oswin Krause
 * \date       2014
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *

 * This file is part of Shark.
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef SHARK_OBJECTIVEFUNCTIONS_LOSS_SQUAREDHINGELOSS_H
#define SHARK_OBJECTIVEFUNCTIONS_LOSS_SQUAREDHINGELOSS_H

#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>

namespace shark {

///
/// \brief Squared Hinge-loss for large margin classification
///
/// The squared hinge loss for two-class problems is defined as
/// \f$ L_i = 1/2 \max \{ 0 , 1 - y_i f(x_i) \}^2 \f$, where \f$ y_i \in \{-1,1\} \f$ is the label
/// and \f$ f(x_i) \f$ is the prediction of the model for the i-th input. The loss introduces the concept
/// of a margin: a point should not only be classified correctly, but should also not lie too close to the
/// decision boundary. Therefore even correctly classified points are penalized if they violate the margin.
///
/// For multi-class problems, the sum of the relative margins is used:
/// \f$ L_i = 1/2 \sum_{c \neq y_i} \max \{ 0 , 1 - 1/2 (f_{y_i}(x_i) - f_c(x_i)) \}^2 \f$. This loss requires
/// a margin between the different class outputs, and the function needs as many outputs as classes. The
/// pre-factor 1/2 ensures that in the two-class, two-output case with a linear function the value of the loss
/// is the same as in the single-output version.
///
/// The loss is implemented for class labels 0,1,...,n, even in the binary case.
///
/// The difference to the standard hinge loss is that the squared hinge loss is differentiable everywhere.
/// However, compared to the hinge loss, small margin violations are penalized less strongly, while large
/// deviations are penalized much more strongly.
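///
/// \par Example (illustrative sketch, not part of the original documentation)
/// The snippet below assumes the Shark batch types UIntVector (one class label per sample) and
/// RealMatrix (one prediction row per sample) and shows how the binary, single-output case is evaluated:
/// \code
/// SquaredHingeLoss loss;
/// UIntVector labels(2);
/// labels(0) = 0; labels(1) = 1;                  // class labels 0/1, mapped internally to -1/+1
/// RealMatrix predictions(2, 1);
/// predictions(0, 0) = -2.0;                      // correct with margin >= 1: contributes 0
/// predictions(1, 0) =  0.5;                      // correct but inside the margin: 1/2*(1-0.5)^2 = 0.125
/// double error = loss.eval(labels, predictions); // total batch loss: 0.125
/// \endcode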
class SquaredHingeLoss : public AbstractLoss<unsigned int, RealVector>
{
public:
	/// constructor
	SquaredHingeLoss(){
		m_features |= base_type::HAS_FIRST_DERIVATIVE;
	}

	/// \brief Returns class name "SquaredHingeLoss"
	std::string name() const
	{ return "SquaredHingeLoss"; }

	///\brief Calculates the sum of all sample losses over the batch of labels and predictions.
	double eval(BatchLabelType const& labels, BatchOutputType const& predictions) const{
		std::size_t numInputs = labels.size();
		SIZE_CHECK(numInputs == predictions.size1());

		double error = 0;
		//binary case for models with single output
		if(predictions.size2() == 1){
			for(std::size_t i = 0; i != numInputs;++i){
				SIZE_CHECK(labels(i) < 2);
				double y = 2.0*labels(i)-1.0;
				error += sqr(std::max(0.0,1.0-y*predictions(i,0)));
			}
		}
		else{//multi-class or multiple output case
			for(std::size_t i = 0; i != numInputs;++i){
				SIZE_CHECK(labels(i) < predictions.size2());
				for(std::size_t o = 0; o != predictions.size2();++o){
					if( o == labels(i)) continue;
					error += sqr(std::max(0.0,2.0 - predictions(i,labels(i))+predictions(i,o)));
				}
			}
			error/=4;
		}
		return error/2;
	}

	///\brief Calculates the sum of all sample losses and the derivative with respect to the predictions.
	double evalDerivative(BatchLabelType const& labels, BatchOutputType const& predictions, BatchOutputType& gradient)const{
		std::size_t numInputs = labels.size();
		std::size_t outputDim = predictions.size2();
		SIZE_CHECK(numInputs == predictions.size1());

		gradient.resize(numInputs,outputDim);
		gradient.clear();
		double error = 0;
		//binary case for models with single output
		if(outputDim == 1){
			for(std::size_t i = 0; i != numInputs; ++i){
				double y = 2.0*labels(i)-1.0;
				double sampleLoss = std::max(0.0,1.0-y*predictions(i,0));
				if(sampleLoss > 0)
					gradient(i,0) = -y*sampleLoss;
				error += sqr(sampleLoss);
			}
		}
		else{//multi-class or multiple output case
			for(std::size_t i = 0; i != numInputs;++i){
				SIZE_CHECK(labels(i) < predictions.size2());
				for(std::size_t o = 0; o != predictions.size2();++o){
					if( o == labels(i)) continue;
					double sampleLoss = std::max(0.0,2.0 - predictions(i,labels(i))+predictions(i,o));
					if(sampleLoss > 0){
						gradient(i,o) = sampleLoss*0.25;
						gradient(i,labels(i)) -= sampleLoss*0.25;
					}
					error+= sqr(sampleLoss);
				}
			}
			error/=4;
		}
		return error/2;
	}
};

}
#endif