//===========================================================================
/*!
*
*
* \brief export precomputed kernel matrices (using libsvm format)
*
*
*
* \author M. Tuma
* \date 2012
*
*
* \par Copyright 1995-2017 Shark Development Team
*
*
* This file is part of Shark.
*
*
* Shark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Shark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Shark. If not, see <http://www.gnu.org/licenses/>.
*
*/
//===========================================================================
#ifndef SHARK_DATA_PRECOMPUTEDMATRIX_H
#define SHARK_DATA_PRECOMPUTEDMATRIX_H
#include <fstream>
#include <iomanip>
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/Models/Kernels/AbstractKernelFunction.h>
#include <shark/Models/Kernels/ScaledKernel.h>
#include <shark/Algorithms/Trainers/NormalizeKernelUnitVariance.h>
namespace shark
{
/**
* \ingroup shark_globals
*
* @{
*/
enum KernelMatrixNormalizationType
{
NONE, // no normalization: output the plain Gram kernel matrix
MULTIPLICATIVE_TRACE_ONE, // determine the trace and divide each entry by it
MULTIPLICATIVE_TRACE_N, // determine the trace, divide each entry by it, then multiply by the number of samples
MULTIPLICATIVE_VARIANCE_ONE, // normalize to unit variance in feature space; see Kloft et al., JMLR 2012
CENTER_ONLY, // center the kernel in feature space; see Cortes et al., JMLR 2012 and ICML 2010
CENTER_AND_MULTIPLICATIVE_TRACE_ONE // first center the kernel in feature space, then divide each entry by the centered kernel's trace
};
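// The normalizations applied below, in formulas:
//  - MULTIPLICATIVE_TRACE_ONE rescales K to K / tr(K); MULTIPLICATIVE_TRACE_N writes N * K / tr(K).
//  - CENTER_ONLY replaces each entry by K_c(i,j) = K(i,j) - rowmean(i) - rowmean(j) + mean,
//    where rowmean(i) is the mean of row i and mean is the mean over all matrix entries.
//  - CENTER_AND_MULTIPLICATIVE_TRACE_ONE additionally divides the centered matrix by its trace.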
/// \brief Write a kernel Gram matrix to stream.
///
/// \param dataset data basis for the Gram matrix
/// \param kernel kernel function to be used (passed by non-const reference)
/// \param out The stream to be written to
/// \param normalizer what kind of normalization to apply. see enum declaration for details.
/// \param scientific should the output be in scientific notation?
/// \param fieldwidth field width for pretty printing
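///
/// The output follows the libsvm convention for precomputed kernels: each row starts
/// with the label, followed by "0:<row index>" and then one "<column index>:<value>"
/// pair per example, with indices counted from 1.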
template<typename InputType, typename LabelType>
void exportKernelMatrix(
LabeledData<InputType, LabelType> const &dataset,
AbstractKernelFunction<InputType> &kernel, // kernel function (cannot be const because a ScaledKernel is built around it below)
std::ostream &out, // The stream to be written to
KernelMatrixNormalizationType normalizer = NONE, // what kind of normalization to apply. see enum declaration for details.
bool scientific = false, // scientific notation?
unsigned int fieldwidth = 0 // for pretty-printing
)
{
//get access to the range of elements
DataView<LabeledData<InputType, LabelType> const> points(dataset);
std::size_t size = points.size();
SIZE_CHECK(size != 0);
// check outstream status
if(!out)
{
throw(std::invalid_argument("[export_kernel_matrix] Can't write to stream."));
}
// COMPUTE MODIFIERS
// if multiplicative trace normalization: determine trace
double trace = 0.0;
double trace_factor = 1.0;
if(normalizer == MULTIPLICATIVE_TRACE_ONE || normalizer == MULTIPLICATIVE_TRACE_N)
{
for(auto point: points)
{
trace += kernel.eval(point.input, point.input);
}
SHARK_ASSERT(trace > 0);
trace_factor = 1.0 / trace;
if(normalizer == MULTIPLICATIVE_TRACE_N)
{
trace_factor *= size;
}
}
// if multiplicative variance normalization: determine factor
double variance_factor = 0.0;
if(normalizer == MULTIPLICATIVE_VARIANCE_ONE)
{
ScaledKernel<InputType> scaled(&kernel);
NormalizeKernelUnitVariance<InputType> unitVarianceTrainer; // named to avoid shadowing the normalizer parameter
unitVarianceTrainer.train(scaled, dataset.inputs());
variance_factor = scaled.factor();
}
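// variance_factor is now the multiplier that rescales the kernel to unit variance in feature space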
// if centering: determine matrix- and row-wise means;
double mean = 0;
RealVector rowmeans(size, 0.0);
if(normalizer == CENTER_ONLY || normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
{
// initialization: calculate mean and rowmeans
for(std::size_t i = 0; i < size; i++)
{
double k = kernel.eval(points[i].input, points[i].input);
mean += k; //add diagonal value to mean once
rowmeans(i) += k; //add diagonal to its rowmean
for(std::size_t j = 0; j < i; j++)
{
double k = kernel.eval(points[i].input, points[j].input);
mean += 2.0 * k; //add off-diagonals to mean twice
rowmeans(i) += k; //add to mean of row
rowmeans(j) += k; //add to mean of transposed row
}
}
mean = mean / (double) size / (double) size;
rowmeans /= size;
// get trace if necessary
if(normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
{
trace = 0.0;
for(std::size_t i = 0; i < size; i++)
{
trace += kernel.eval(points[i].input, points[i].input) - 2 * rowmeans(i) + mean;
}
SHARK_ASSERT(trace > 0);
trace_factor = 1.0 / trace;
}
}
// FIX OUTPUT FORMAT
// set output format
if(scientific)
out.setf(std::ios_base::scientific);
std::streamsize ss = out.precision();
out.precision(10);
// determine dataset type
double max_label = -1e100;
double min_label = -max_label;
bool binary = false;
bool regression = false;
for(double cur_label: dataset.labels().elements())
{
if(cur_label > max_label)
max_label = cur_label;
if(cur_label < min_label)
min_label = cur_label;
if((cur_label != (int)cur_label) || cur_label < 0)
regression = true;
}
if(!regression && (min_label == 0) && (max_label == 1))
binary = true;
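// Label conventions for the libsvm output: binary {0,1} labels are written as -1/+1,
// other integer class labels are shifted from 0-based to 1-based, and regression
// labels (non-integer or negative) are written unchanged.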
// WRITE OUT
// write to file:
// loop through examples (rows)
for(std::size_t i = 0; i < size; i++)
{
// write label
if(regression)
{
out << std::setw(fieldwidth) << std::left << points[i].label << " ";
}
else if(binary)
{
out << std::setw(fieldwidth) << std::left << (int)(points[i].label * 2 - 1) << " ";
}
else
{
out << std::setw(fieldwidth) << std::left << (unsigned int)(points[i].label + 1) << " ";
}
out << "0:" << std::setw(fieldwidth) << std::left << i + 1; //write index
// loop through examples (columns)
// CASE DISTINCTION:
if(normalizer == NONE)
{
for(std::size_t j = 0; j < size; j++)
{
out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << kernel.eval(points[i].input, points[j].input);
}
out << "\n";
}
else if(normalizer == MULTIPLICATIVE_TRACE_ONE || normalizer == MULTIPLICATIVE_TRACE_N)
{
for(std::size_t j = 0; j < size; j++)
{
out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << trace_factor * kernel.eval(points[i].input, points[j].input);
}
out << "\n";
}
else if(normalizer == MULTIPLICATIVE_VARIANCE_ONE)
{
for(std::size_t j = 0; j < size; j++)
{
out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << variance_factor *kernel.eval(points[i].input, points[j].input);
}
out << "\n";
}
else if(normalizer == CENTER_ONLY)
{
for(std::size_t j = 0; j < size; j++)
{
double tmp = kernel.eval(points[i].input, points[j].input) - rowmeans(i) - rowmeans(j) + mean;
out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << tmp;
}
out << "\n";
}
else if(normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
{
for(std::size_t j = 0; j < size; j++)
{
double tmp = kernel.eval(points[i].input, points[j].input) - rowmeans(i) - rowmeans(j) + mean;
out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << trace_factor *tmp;
}
out << "\n";
}
else
{
throw SHARKEXCEPTION("[detail::export_kernel_matrix] Unknown normalization type.");
}
}
// clean up
out.precision(ss);
}
/// \brief Write a kernel Gram matrix to file.
///
/// \param dataset data basis for the Gram matrix
/// \param kernel kernel function to be used (passed by non-const reference)
/// \param fn The filename of the file to be written to
/// \param normalizer what kind of normalization to apply. see enum declaration for details.
/// \param sci should the output be in scientific notation?
/// \param width field width for pretty printing
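///
/// Example (a minimal sketch; assumes a ClassificationDataset named data that was
/// loaded elsewhere, and a GaussianRbfKernel, which requires including
/// shark/Models/Kernels/GaussianRbfKernel.h):
/// \code
/// GaussianRbfKernel<RealVector> kernel(0.5); // gamma = 0.5, chosen arbitrarily
/// exportKernelMatrix(data, kernel, "gram.libsvm", MULTIPLICATIVE_TRACE_ONE);
/// \endcode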
template<typename InputType, typename LabelType>
void exportKernelMatrix(
LabeledData<InputType, LabelType> const &dataset,
AbstractKernelFunction<InputType> &kernel,
std::string fn,
KernelMatrixNormalizationType normalizer = NONE,
bool sci = false,
unsigned int width = 0
)
{
std::ofstream ofs(fn.c_str());
if(ofs)
{
exportKernelMatrix(dataset, kernel, ofs, normalizer, sci, width);
}
else
throw(std::invalid_argument("[detail::export_kernel_matrix] Stream cannot be opened for writing."));
}
// deprecated wrapper
template<typename InputType, typename LabelType>
void export_kernel_matrix(
LabeledData<InputType, LabelType> const &dataset,
AbstractKernelFunction<InputType> &kernel, // kernel function (cannot be const because of the ScaledKernel used internally)
std::ostream &out, // The stream to be written to
KernelMatrixNormalizationType normalizer = NONE, // what kind of normalization to apply. see enum declaration for details.
bool scientific = false, // scientific notation?
unsigned int fieldwidth = 0 // for pretty-printing
)
{
exportKernelMatrix(dataset, kernel, out, normalizer, scientific, fieldwidth);
}
// deprecated wrapper
template<typename InputType, typename LabelType>
void export_kernel_matrix(
LabeledData<InputType, LabelType> const &dataset,
AbstractKernelFunction<InputType> &kernel,
std::string fn,
KernelMatrixNormalizationType normalizer = NONE,
bool sci = false,
unsigned int width = 0
)
{
exportKernelMatrix(dataset, kernel, fn, normalizer, sci, width);
}
// TODO: import functionality is still missing.
// when that is done, add tutorial
/** @}*/
} // namespace shark
#endif // SHARK_DATA_PRECOMPUTEDMATRIX_H