/*!
*
*
* \brief Kernel Target Alignment - a measure of alignment of a kernel Gram matrix with labels.
*
*
*
* \author T. Glasmachers, O. Krause
* \date 2010-2012
*
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Shark. If not, see <http://www.gnu.org/licenses/>.
*
*/
#ifndef SHARK_OBJECTIVEFUNCTIONS_KERNELTARGETALIGNMENT_H
#define SHARK_OBJECTIVEFUNCTIONS_KERNELTARGETALIGNMENT_H
#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/Data/Dataset.h>
#include <shark/Data/Statistics.h>
#include <shark/Models/Kernels/AbstractKernelFunction.h>
namespace shark{
/*!
* \brief Kernel Target Alignment - a measure of alignment of a kernel Gram matrix with labels.
*
* \par
* The Kernel Target Alignment (KTA) was originally proposed in the paper:
* On Kernel-Target Alignment. N. Cristianini, J. Shawe-Taylor,
* A. Elisseeff, J. Kandola. Innovations in Machine Learning, 2006.
* Here we provide a version with centering of the features as proposed
* in the paper:
* Two-Stage Learning Kernel Algorithms. C. Cortes, M. Mohri,
* A. Rostamizadeh. ICML 2010.
*
* \par
* The kernel target alignment is defined as
* \f[ \hat A = \frac{\langle K, y y^T \rangle}{\sqrt{\langle K, K \rangle \cdot \langle y y^T, y y^T \rangle}} \f]
* where K is the kernel Gram matrix of the data and y is the vector of
* +1/-1 valued labels. The outer product \f$ y y^T \f$ corresponds to
* an "ideal" Gram matrix corresponding to a kernel that maps
* the two classes each to a single point, thus minimizing within-class
* distance for fixed inter-class distance. The inner products denote the
* Frobenius product of matrices:
* http://en.wikipedia.org/wiki/Matrix_multiplication#Frobenius_product
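* (concretely, \f$ \langle A, B \rangle = \sum_{i,j} A_{i,j} B_{i,j} \f$).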
*
* \par
* In kernel-based learning, the kernel Gram matrix K is of the form
* \f[ K_{i,j} = k(x_i, x_j) = \langle \phi(x_i), \phi(x_j) \rangle \f]
* for a Mercer kernel function k and inputs \f$ x_i, x_j \f$. In this
* version of the KTA we use centered feature vectors. Let
* \f[ \psi(x_i) = \phi(x_i) - \frac{1}{\ell} \sum_{j=1}^{\ell} \phi(x_j) \f]
* denote the centered feature vectors, then the centered Gram matrix
* \f$ K^c \f$ is given by
* \f[ K^c_{i,j} = \langle \psi(x_i), \psi(x_j) \rangle = K_{i,j} - \frac{1}{\ell} \sum_{n=1}^{\ell} \left( K_{i,n} + K_{j,n} \right) + \frac{1}{\ell^2} \sum_{m,n=1}^{\ell} K_{m,n} \f]
* The alignment measure computed by this class is exactly the same formula
* for \f$ \hat A \f$, but with \f$ K^c \f$ plugged in in place of \f$ K \f$.
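* Equivalently, with the centering matrix \f$ H = I - \frac{1}{\ell} u u^T \f$
* (u being the all-ones vector), the centered Gram matrix can be written
* compactly as \f$ K^c = H K H \f$.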
*
* \par
* KTA measures the Frobenius inner product between a kernel Gram matrix
* and this ideal matrix. The interpretation is that KTA measures how
* well a given kernel fits a classification problem. The actual measure
* is invariant under kernel rescaling.
* In Shark, objective functions are minimized by convention. Therefore
* the negative alignment \f$ - \hat A \f$ is implemented. The measure is
* extended for multi-class problems by using prototype vectors instead
* of scalar labels.
*
* \par
* The following properties of KTA are important from a model selection
* point of view: it is relatively fast and easy to compute, it is
* differentiable w.r.t. the kernel function, and it is independent of
* the actual classifier.
*
* \par
* The following notation is used in several of the methods of the class.
* \f$ K^c \f$ denotes the centered Gram matrix, y is the vector of labels,
* Y is the outer product of this vector with itself, k is the row
* (or column) wise average of the uncentered Gram matrix K, my is the
* label average, u is the vector of all ones, and \f$ \ell \f$ is the
* number of data points and thus the size of the Gram matrix.
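*
* \par Example
* A minimal usage sketch (assuming a classification dataset `data` has already
* been loaded; the kernel bandwidth and the optimizer are illustrative choices):
* \code
* GaussianRbfKernel<> kernel(0.5);                        // kernel whose parameters are to be tuned
* KernelTargetAlignment<> kta(data, &kernel);             // negative alignment as objective function
* IRpropPlus optimizer;                                   // any gradient-based optimizer can be used
* optimizer.init(kta);                                    // starts from the current kernel parameters
* for(std::size_t i = 0; i != 50; ++i)
*     optimizer.step(kta);                                // minimizes -\hat A
* kernel.setParameterVector(optimizer.solution().point);  // adopt the best parameters found
* \endcode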
*/
template<class InputType = RealVector, class LabelType = unsigned int>
class KernelTargetAlignment : public AbstractObjectiveFunction< RealVector, double >
{
private:
typedef typename Batch<LabelType>::type BatchLabelType;
public:
/// \brief Construction of the Kernel Target Alignment (KTA) from a kernel object.
KernelTargetAlignment(
LabeledData<InputType, LabelType> const& dataset,
AbstractKernelFunction<InputType>* kernel,
bool centering = true
){
SHARK_RUNTIME_CHECK(kernel != NULL, "[KernelTargetAlignment] kernel must not be NULL");
mep_kernel = kernel;
m_features|=HAS_VALUE;
m_features|=CAN_PROPOSE_STARTING_POINT;
if(mep_kernel -> hasFirstParameterDerivative())
m_features|=HAS_FIRST_DERIVATIVE;
m_data = dataset;
m_elements = dataset.numberOfElements();
m_centering = centering;
setupY(dataset.labels(), centering);
}
/// \brief From INameable: return the class name.
std::string name() const
{ return "KernelTargetAlignment"; }
/// Return the current kernel parameters as a starting point for an optimization run.
SearchPointType proposeStartingPoint() const {
return mep_kernel -> parameterVector();
}
std::size_t numberOfVariables()const{
return mep_kernel->numberOfParameters();
}
/// \brief Evaluate the (centered, negative) Kernel Target Alignment (KTA).
///
/// See the class description for more details on this computation.
double eval(RealVector const& input) const{
mep_kernel->setParameterVector(input);
return -evaluateKernelMatrix().error;
}
/// \brief Compute the derivative of the KTA as a function of the kernel parameters.
///
/// It holds:
/// \f[ \langle K^c, K^c \rangle = \langle K,K \rangle - 2 \ell \langle k,k \rangle + mk^2 \ell^2 \\
///     (\langle K^c, K^c \rangle)' = 2 \langle K,K' \rangle - 4 \ell \langle k, \frac{1}{\ell} \sum_j K'_{ij} \rangle + 2 \ell^2 mk \sum_{ij} \frac{1}{\ell^2} K'_{ij} \\
///     = 2 \langle K,K' \rangle - 4 \langle k, \sum_j K'_{ij} \rangle + 2 mk \sum_{ij} K'_{ij} \\
///     = 2 \langle K,K' \rangle - 4 \langle k u^T, K' \rangle + 2 mk \langle u u^T, K' \rangle \\
///     = 2 \langle K - 2 k u^T + mk u u^T, K' \rangle \\
///     \langle Y, K^c \rangle = \langle Y, K \rangle - 2 \ell \langle y, k \rangle + \ell^2 my \, mk \\
///     (\langle Y, K^c \rangle)' = \langle Y - 2 y u^T + my u u^T, K' \rangle \f]
/// The derivative is then computed from these values in a second sweep over the data.
/// We get
/// \f[ \hat A' = \frac{1}{\langle K^c,K^c \rangle^{3/2}} \left( \langle K^c,K^c \rangle (\langle Y,K^c \rangle)' - \frac{1}{2} \langle Y, K^c \rangle (\langle K^c, K^c \rangle)' \right) \\
///     = \frac{1}{\langle K^c,K^c \rangle^{3/2}} \langle \langle K^c,K^c \rangle (Y - 2 y u^T + my \, u u^T) - \langle Y, K^c \rangle (K - 2 k u^T + mk \, u u^T), K' \rangle \\
///     = \frac{1}{\langle K^c,K^c \rangle^{3/2}} \langle W, K' \rangle \f]
/// Reordering results in
/// \f[ W = \langle K^c,K^c \rangle Y - \langle Y, K^c \rangle K \\
///     - 2 (\langle K^c,K^c \rangle y - \langle Y, K^c \rangle k) u^T \\
///     + (\langle K^c,K^c \rangle my - \langle Y, K^c \rangle mk) u u^T \f]
/// where \f$ K' \f$ is the derivative of K with respect to the kernel parameters.
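///
/// A minimal finite-difference sanity check of this derivative (a sketch; `kta`
/// denotes an already constructed KernelTargetAlignment object and `point` a
/// kernel parameter vector, both assumed to exist):
/// \code
/// RealVector gradient;
/// double value = kta.evalDerivative(point, gradient);  // value equals -\hat A at point
/// double eps = 1e-6;
/// RealVector shifted = point;
/// shifted(0) += eps;                                    // perturb the first kernel parameter
/// double fd = (kta.eval(shifted) - value) / eps;        // should approximately equal gradient(0)
/// \endcode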
ResultType evalDerivative( const SearchPointType & input, FirstOrderDerivative & derivative ) const {
mep_kernel->setParameterVector(input);
// the derivative is calculated in two sweeps over the data. First, the required statistics
// \langle K^c,K^c \rangle, mk and k are evaluated exactly as in eval()
KernelMatrixResults results = evaluateKernelMatrix();
std::size_t parameters = mep_kernel->numberOfParameters();
derivative.resize(parameters);
derivative.clear();
SHARK_PARALLEL_FOR(int i = 0; i < (int)m_data.numberOfBatches(); ++i){
std::size_t startX = 0;
for(int j = 0; j != i; ++j){
startX+= batchSize(m_data.batch(j));
}
RealVector threadDerivative(parameters,0.0);
RealVector blockDerivative;
boost::shared_ptr<State> state = mep_kernel->createState();
RealMatrix blockK;//block of the KernelMatrix
RealMatrix blockW;//block of the WeightMatrix
std::size_t startY = 0;
for(int j = 0; j <= i; ++j){
mep_kernel->eval(m_data.batch(i).input,m_data.batch(j).input,blockK,*state);
mep_kernel->weightedParameterDerivative(
m_data.batch(i).input,m_data.batch(j).input,
generateDerivativeWeightBlock(i,j,startX,startY,blockK,results),//takes symmetry into account
*state,
blockDerivative
);
noalias(threadDerivative) += blockDerivative;
startY += batchSize(m_data.batch(j));
}
SHARK_CRITICAL_REGION{
noalias(derivative) += threadDerivative;
}
}
derivative *= -1;
derivative /= m_elements;
return -results.error;
}
private:
AbstractKernelFunction<InputType>* mep_kernel; ///< kernel function
LabeledData<InputType, LabelType> m_data; ///< data points
RealVector m_columnMeanY; ///< mean label vector
double m_meanY; ///< mean label element
std::size_t m_numberOfClasses; ///< number of classes
std::size_t m_elements; ///< number of data points
bool m_centering;
struct KernelMatrixResults{
RealVector k;
double KcKc;
double YcKc;
double error;
double meanK;
};
void setupY(Data<unsigned int> const& labels, bool centering){
//preprocess Y to calculate the column means and the overall mean
//the most efficient way to do this is via the class counts
std::vector<std::size_t> classCount = classSizes(labels);
m_numberOfClasses = classCount.size();
RealVector classMean(m_numberOfClasses);
double dm1 = m_numberOfClasses-1.0;
m_meanY = 0;
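//classMean(i) is the (unnormalized) column mean of Y for an example of class i:
//same-class entries contribute +1, all other entries contribute -1/(numberOfClasses-1)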
for(std::size_t i = 0; i != m_numberOfClasses; ++i){
classMean(i) = classCount[i]-(m_elements-classCount[i])/dm1;
m_meanY += classCount[i] * classMean(i);
}
classMean /= m_elements;
m_meanY /= sqr(double(m_elements));
m_columnMeanY.resize(m_elements);
for(std::size_t i = 0; i != m_elements; ++i){
m_columnMeanY(i) = classMean(labels.element(i));
}
if(!centering){
m_meanY = 0;
m_columnMeanY.clear();
}
}
void setupY(Data<RealVector> const& labels, bool centering){
RealVector meanLabel = mean(labels);
m_columnMeanY.resize(m_elements);
for(std::size_t i = 0; i != m_elements; ++i){
m_columnMeanY(i) = inner_prod(labels.element(i),meanLabel);
}
m_meanY=inner_prod(meanLabel,meanLabel);
if(!centering){
m_meanY = 0;
m_columnMeanY.clear();
}
}
void computeBlockY(UIntVector const& labelsi,UIntVector const& labelsj, RealMatrix& blockY)const{
std::size_t blockSize1 = labelsi.size();
std::size_t blockSize2 = labelsj.size();
double dm1 = m_numberOfClasses-1.0;
for(std::size_t k = 0; k != blockSize1; ++k){
for(std::size_t l = 0; l != blockSize2; ++l){
if( labelsi(k) == labelsj(l))
blockY(k,l) = 1;
else
blockY(k,l) = -1.0/dm1;
}
}
}
void computeBlockY(RealMatrix const& labelsi,RealMatrix const& labelsj, RealMatrix& blockY)const{
noalias(blockY) = labelsi % trans(labelsj);
}
/// Update a sub-block of the matrix \f$ \langle Y, K^x \rangle \f$.
double updateYK(UIntVector const& labelsi,UIntVector const& labelsj, RealMatrix const& block)const{
std::size_t blockSize1 = labelsi.size();
std::size_t blockSize2 = labelsj.size();
//todo optimize the i=j case
double result = 0;
double dm1 = m_numberOfClasses-1.0;
for(std::size_t k = 0; k != blockSize1; ++k){
for(std::size_t l = 0; l != blockSize2; ++l){
if(labelsi(k) == labelsj(l))
result += block(k,l);
else
result -= block(k,l)/dm1;
}
}
return result;
}
/// Update a sub-block of the matrix \f$ \langle Y, K^x \rangle \f$.
double updateYK(RealMatrix const& labelsi,RealMatrix const& labelsj, RealMatrix const& block)const{
RealMatrix Y(labelsi.size1(), labelsj.size1());
computeBlockY(labelsi,labelsj,Y);
return sum(Y * block);
}
/// Compute a sub-block of the matrix
/// \f[ W = \langle K^c, K^c \rangle Y - \langle Y, K^c \rangle K -2 (\langle K^c, K^c \rangle y - \langle Y, K^c \rangle k) u^T + (\langle K^c, K^c \rangle my - \langle Y, K^c \rangle mk) u u^T \f]
RealMatrix generateDerivativeWeightBlock(
std::size_t i, std::size_t j,
std::size_t start1, std::size_t start2,
RealMatrix const& blockK,
KernelMatrixResults const& matrixStatistics
)const{
std::size_t blockSize1 = batchSize(m_data.batch(i));
std::size_t blockSize2 = batchSize(m_data.batch(j));
//double n = m_elements;
double KcKc = matrixStatistics.KcKc;
double YcKc = matrixStatistics.YcKc;
double meanK = matrixStatistics.meanK;
RealMatrix blockW(blockSize1,blockSize2);
//first calculate \langle Kc,Kc \rangle Y.
computeBlockY(m_data.batch(i).label,m_data.batch(j).label,blockW);
blockW *= KcKc;
//- \langle Y,K^c \rangle K
blockW-=YcKc*blockK;
// -2(\langle K^c,K^c \rangle y - \langle Y, K^c \rangle k) u^T
// implemented in symmetrized form as: -(\langle K^c,K^c \rangle y - \langle Y, K^c \rangle k) u^T - u (\langle K^c,K^c \rangle y - \langle Y, K^c \rangle k)^T
// this is valid because the symmetrized term has the same Frobenius product with the symmetric matrix K'
blockW-=repeat(subrange(KcKc*m_columnMeanY - YcKc*matrixStatistics.k,start2,start2+blockSize2),blockSize1);
blockW-=trans(repeat(subrange(KcKc*m_columnMeanY - YcKc*matrixStatistics.k,start1,start1+blockSize1),blockSize2));
// + (\langle K^c,K^c \rangle my - \langle Y, K^c \rangle mk) u u^T
blockW+= KcKc*m_meanY-YcKc*meanK;
blockW /= KcKc*std::sqrt(KcKc);
//symmetry
if(i != j)
blockW *= 2.0;
return blockW;
}
/// \brief Evaluate the centered kernel Gram matrix.
///
/// The computation is as follows. By means of a
/// number of identities it holds
/// \f[ \langle K^c, K^c \rangle = \langle K,K \rangle - 2n \langle k,k \rangle + mk^2 n^2 \\
///     \langle K^c, Y \rangle = \langle K, Y \rangle - 2 n \langle k, y \rangle + n^2 mk \, my \f]
/// where k is the row mean over K and y the row mean over Y, mk and my are the total means of K and Y,
/// and n is the number of elements.
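/// The error value stored in the result is \f$ \langle Y, K^c \rangle / (n \sqrt{\langle K^c, K^c \rangle}) \f$;
/// eval() returns its negative.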
KernelMatrixResults evaluateKernelMatrix()const{
//it holds
// \langle K^c,K^c \rangle = \langle K,K \rangle -2n\langle k,k \rangle +mk^2n^2
// \langle K^c,Y \rangle = \langle K, Y \rangle - 2 n \langle k, y \rangle + n^2 mk my
// where k is the row mean over K and y the row mean over Y, mk, my the total means of K and Y
// and n the number of elements
double KK = 0; //stores \langle K,K \rangle
double YK = 0; //stores \langle Y,K \rangle
RealVector k(m_elements,0.0);//stores the row/column means of K
SHARK_PARALLEL_FOR(int i = 0; i < (int)m_data.numberOfBatches(); ++i){
std::size_t startRow = 0;
for(int j = 0; j != i; ++j){
startRow+= batchSize(m_data.batch(j));
}
std::size_t rowSize = batchSize(m_data.batch(i));
double threadKK = 0;
double threadYK = 0;
RealVector threadk(m_elements,0.0);
std::size_t startColumn = 0; //starting column of the current block
for(int j = 0; j <= i; ++j){
std::size_t columnSize = batchSize(m_data.batch(j));
RealMatrix blockK = (*mep_kernel)(m_data.batch(i).input,m_data.batch(j).input);
if(i == j){
threadKK += frobenius_prod(blockK,blockK);
subrange(threadk,startColumn,startColumn+columnSize)+=sum_rows(blockK);//update sum_rows(K)
threadYK += updateYK(m_data.batch(i).label,m_data.batch(j).label,blockK);
}
else{//use symmetry of K
threadKK += 2.0 * frobenius_prod(blockK,blockK);
subrange(threadk,startColumn,startColumn+columnSize)+=sum_rows(blockK);
subrange(threadk,startRow,startRow+rowSize)+=sum_columns(blockK);//symmetry: block(j,i)
threadYK += 2.0 * updateYK(m_data.batch(i).label,m_data.batch(j).label,blockK);
}
startColumn+=columnSize;
}
SHARK_CRITICAL_REGION{
KK += threadKK;
YK +=threadYK;
noalias(k) +=threadk;
}
}
//calculate the error
double n = (double)m_elements;
k /= n;//means
double meanK = sum(k)/n;
if(!m_centering){
k.clear();
meanK = 0;
}
double n2 = sqr(n);
double YcKc = YK-2.0*n*inner_prod(k,m_columnMeanY)+n2*m_meanY*meanK;
double KcKc = KK - 2.0*n*inner_prod(k,k)+n2*sqr(meanK);
KernelMatrixResults results;
results.k=k;
results.YcKc = YcKc;
results.KcKc = KcKc;
results.meanK = meanK;
results.error = YcKc/std::sqrt(KcKc)/n;
return results;
}
};
}
#endif