//===========================================================================
/*!
*
*
* \brief Error measure for classification tasks that can be used
* as the objective function for training.
*
*
*
*
* \author -
* \date -
*
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Shark. If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef SHARK_OBJECTIVEFUNCTIONS_LOSS_CROSS_ENTROPY_H
#define SHARK_OBJECTIVEFUNCTIONS_LOSS_CROSS_ENTROPY_H
#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>
namespace shark{
/*!
* \brief Error measure for classification tasks that can be used
* as the objective function for training.
*
* If your model returns a vector whose components reflect the
* logarithmic conditional probabilities of class membership given an input vector,
* CrossEntropy is the adequate error measure for model training.
* For \em C > 1 classes the loss function is defined as
* \f[
* E = - \ln \frac{\exp(x_c)}{\sum_{c'=1}^C \exp(x_{c'})} = - x_c + \ln \sum_{c'=1}^C \exp(x_{c'})
* \f]
* where \em x is the prediction vector of the model and \em c is the class label. In the case of only one
* model output and binary classification, another, numerically more stable formulation is used:
* \f[
* E = \ln(1 + e^{-yx})
* \f]
* Here, \em y is the class label recoded to -1 or 1 via y = 2c - 1. This formulation is numerically more stable
* because when \f$ e^{-yx} \f$ is large, the error function is well approximated by the linear function \f$ -yx \f$, and
* when the exponential is very small, the case \f$ \ln(0) \f$ is avoided.
*
* If the class labels are integers, they must start from 0. If the class labels are vectors, each must be a proper
* probability vector, i.e. all values must be greater than or equal to zero and sum to one. This includes one-hot encodings of labels.
* Also, for theoretical reasons, the output neurons of a neural network that is trained with this loss should be linear.
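*
* A minimal usage sketch for the integer-label case (the scores and the label are made-up values for illustration):
* \code
* CrossEntropy<unsigned int, RealVector> loss;
* RealVector prediction(3); // unnormalized class scores ("logits")
* prediction(0) = 1.0; prediction(1) = -0.5; prediction(2) = 0.25;
* unsigned int label = 2; // true class index, counted from 0
* double error = loss.eval(label, prediction);
* RealVector gradient;
* loss.evalDerivative(label, prediction, gradient); // gradient = softmax(prediction) - one_hot(label)
* \endcode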
*/
template<class LabelType, class OutputType>
class CrossEntropy;

template<class OutputType>
class CrossEntropy<unsigned int, OutputType> : public AbstractLoss<unsigned int, OutputType>
{
private:
typedef AbstractLoss<unsigned int, OutputType> base_type;
typedef typename base_type::ConstLabelReference ConstLabelReference;
typedef typename base_type::ConstOutputReference ConstOutputReference;
typedef typename base_type::BatchOutputType BatchOutputType;
typedef typename base_type::MatrixType MatrixType;
//uses a different formula to compute the binary case for 1 output,
//which should be numerically more stable
//formula: ln(1+exp(-yx)) with y in {-1,1}
double evalError(double label,double exponential,double value) const {
if(value*label < -200 ){
//below this, we might get numeric instabilities
//but we know, that ln(1+exp(x)) converges to x for big arguments
return - value * label;
}
return std::log(1+exponential);
}
public:
CrossEntropy()
{ this->m_features |= base_type::HAS_FIRST_DERIVATIVE;}
/// \brief From INameable: return the class name.
std::string name() const
{ return "CrossEntropy"; }
// annoyingness of C++ templates
using base_type::eval;
double eval(UIntVector const& target, BatchOutputType const& prediction) const {
double error = 0;
for(std::size_t i = 0; i != prediction.size1(); ++i){
error += eval(target(i), row(prediction,i));
}
return error;
}
double eval( ConstLabelReference target, ConstOutputReference prediction)const{
if ( prediction.size() == 1 )
{
RANGE_CHECK ( target < 2 );
double label = 2.0 * target - 1; //converts labels from 0/1 to -1/1
double exponential = std::exp( -label * prediction(0) );
return evalError(label,exponential,prediction(0 ));
}else{
RANGE_CHECK ( target < prediction.size() );
//calculate the log norm in a numerically stable way
//we subtract the maximum prior to exponentiation to
//ensure that the exponentiation result will still fit in double
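//log sum_c exp(x_c) = m + log sum_c exp(x_c - m) holds for any m,
//so choosing m = max(x) keeps every exponent non-positive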
double maximum = max(prediction);
double logNorm = sum(exp(prediction-maximum));
logNorm = std::log(logNorm) + maximum;
return logNorm - prediction(target);
}
}
double evalDerivative(UIntVector const& target, BatchOutputType const& prediction, BatchOutputType& gradient) const {
gradient.resize(prediction.size1(),prediction.size2());
if ( prediction.size2() == 1 )
{
double error = 0;
for(std::size_t i = 0; i != prediction.size1(); ++i){
RANGE_CHECK ( target(i) < 2 );
double label = 2 * static_cast<double>(target(i)) - 1; //converts labels from 0/1 to -1/1
double exponential = std::exp ( -label * prediction (i, 0 ) );
double sigmoid = 1.0/(1.0+exponential);
gradient ( i,0 ) = -label * (1.0 - sigmoid);
error+=evalError(label,exponential,prediction (i, 0 ));
}
return error;
}else{
double error = 0;
for(std::size_t i = 0; i != prediction.size1(); ++i){
RANGE_CHECK ( target(i) < prediction.size2() );
auto gradRow=row(gradient,i);
//calculate the log norm in a numerically stable way
//we subtract the maximum prior to exponentiation to
//ensure that the exponentiation result will still fit in double
//this does not change the result as the values get normalized by
//their sum and thus the correction term cancels out.
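//the resulting gradient is softmax(prediction) minus the one-hot encoding of the target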
double maximum = max(row(prediction,i));
noalias(gradRow) = exp(row(prediction,i) - maximum);
double norm = sum(gradRow);
gradRow/=norm;
gradient(i,target(i)) -= 1;
error+=std::log(norm) - prediction(i,target(i))+maximum;
}
return error;
}
}
double evalDerivative(ConstLabelReference target, ConstOutputReference prediction, OutputType& gradient) const {
gradient.resize(prediction.size());
if ( prediction.size() == 1 ){
RANGE_CHECK ( target < 2 );
double label = 2.0 * target - 1; //converts labels from 0/1 to -1/1
double exponential = std::exp ( - label * prediction(0));
double sigmoid = 1.0/(1.0+exponential);
gradient(0) = -label * (1.0 - sigmoid);
return evalError(label,exponential,prediction(0));
}else{
RANGE_CHECK ( target < prediction.size() );
//calculate the log norm in a numerically stable way
//we subtract the maximum prior to exponentiation to
//ensure that the exponentiation result will still fit in double
//this does not change the result as the values get normalized by
//their sum and thus the correction term cancels out.
double maximum = max(prediction);
noalias(gradient) = exp(prediction - maximum);
double norm = sum(gradient);
gradient /= norm;
gradient(target) -= 1;
return std::log(norm) - prediction(target) + maximum;
}
}
double evalDerivative(
ConstLabelReference target, ConstOutputReference prediction,
OutputType& gradient, MatrixType& hessian
) const {
gradient.resize(prediction.size());
hessian.resize(prediction.size(),prediction.size());
if ( prediction.size() == 1 )
{
RANGE_CHECK ( target < 2 );
double label = 2 * static_cast<double>(target) - 1; //converts labels from 0/1 to -1/1
double exponential = std::exp ( -label * prediction ( 0 ) );
double sigmoid = 1.0/(1.0+exponential);
gradient ( 0 ) = -label * (1.0-sigmoid);
hessian ( 0,0 ) = sigmoid * ( 1-sigmoid );
return evalError(label,exponential,prediction ( 0 ));
}
else
{
RANGE_CHECK ( target < prediction.size() );
//calculate the log norm in a numerically stable way
//we subtract the maximum prior to exponentiation to
//ensure that the exponentiation result will still fit in double
//this does not change the result as the values get normalized by
//their sum and thus the correction term cancels out.
double maximum = max(prediction);
noalias(gradient) = exp(prediction-maximum);
double norm = sum(gradient);
gradient/=norm;
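//the Hessian of the log-sum-exp term is the softmax Jacobian diag(p) - p p^T with p = softmax(prediction)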
noalias(hessian)=-outer_prod(gradient,gradient);
noalias(diag(hessian)) += gradient;
gradient(target) -= 1;
return std::log(norm) - prediction(target) + maximum;
}
}
};
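/*!
* \brief Specialization of the cross entropy for labels given as probability vectors.
*
* Here each label is itself a probability vector over the classes, for example a
* one-hot encoding or a soft label distribution. A minimal usage sketch with a
* batch of one sample and made-up scores:
* \code
* CrossEntropy<RealVector, RealVector> loss;
* RealMatrix predictions(1, 3); // one sample, 3 class scores
* predictions(0,0) = 1.0; predictions(0,1) = -0.5; predictions(0,2) = 0.25;
* RealMatrix targets(1, 3); // one-hot label for class 2
* targets(0,0) = 0.0; targets(0,1) = 0.0; targets(0,2) = 1.0;
* double error = loss.eval(targets, predictions);
* \endcode
*/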
template<class T>
class CrossEntropy<blas::vector<T>, blas::vector<T> >
: public AbstractLoss<blas::vector<T>, blas::vector<T> >
{
private:
typedef blas::vector<T> OutputType;
typedef AbstractLoss<OutputType, OutputType> base_type;
typedef typename base_type::ConstLabelReference ConstLabelReference;
typedef typename base_type::ConstOutputReference ConstOutputReference;
typedef typename base_type::BatchOutputType BatchOutputType;
typedef typename base_type::MatrixType MatrixType;
public:
CrossEntropy()
{ this->m_features |= base_type::HAS_FIRST_DERIVATIVE;}
/// \brief From INameable: return the class name.
std::string name() const
{ return "CrossEntropy"; }
// annoyingness of C++ templates
using base_type::eval;
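/// \brief Computes the batch error \f$ E = \sum_i \big( \ln \sum_{c=1}^C e^{x_{ic}} - \sum_{c=1}^C t_{ic} x_{ic} \big) \f$,
/// the cross entropy between each probability-vector label \f$ t_i \f$ and the softmax of the corresponding prediction \f$ x_i \f$.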
double eval(BatchOutputType const& target, BatchOutputType const& prediction) const {
SIZE_CHECK(target.size1() == prediction.size1());
SIZE_CHECK(target.size2() == prediction.size2());
std::size_t m = target.size2();
OutputType maximum = max_columns(prediction);
auto safeExp = exp(prediction - trans(blas::repeat(maximum, m)));
OutputType norm = sum_columns(safeExp);
double error = sum(log(norm)) - sum(target * prediction) + sum(maximum);
return error;
}
double evalDerivative(BatchOutputType const& target, BatchOutputType const& prediction, BatchOutputType& gradient) const {
gradient.resize(prediction.size1(),prediction.size2());
std::size_t m = target.size2();
OutputType maximum = max_columns(prediction);
noalias(gradient) = exp(prediction - trans(blas::repeat(maximum, m)));
OutputType norm = sum_columns(gradient);
noalias(gradient) /= trans(blas::repeat(norm, m));
noalias(gradient) -= target;
double error = sum(log(norm)) - sum(target * prediction) + sum(maximum);
return error;
}
};
}
#endif