/*! * \brief Implements the statistics module of shark datasets * * \author O. Krause * \date 2015 * * * \par Copyright 1995-2017 Shark Development Team * *

* This file is part of Shark. * * * Shark is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Shark is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with Shark. If not, see . * */ namespace shark{ /*! * \brief Calculates the mean and variance values of a dataset * * Given the vector of data, the mean and variance values * are calculated as in the functions #mean and #variance. * * \param data Input data. * \param meanVec Vector of mean values. * \param varianceVec Vector of variances. * */ template void meanvar ( Data const& data, blas::vector_container& meanVec, blas::vector_container& varianceVec ) { SIZE_CHECK(!data.empty()); std::size_t const dataSize = data.numberOfElements(); std::size_t elementSize=dataDimension(data); varianceVec().resize(elementSize); varianceVec().clear(); meanVec()= mean(data); //sum of variances of each column for(auto& batch: data.batches()){ std::size_t batchSize = batch.size1(); noalias(varianceVec()) += sum_rows(sqr(batch-repeat(meanVec,batchSize))); } varianceVec() /= double(dataSize); } /*! * \brief Calculates the mean and covariance values of a set of data * * Given the vector of data, the mean and variance values * are calculated as in the functions #mean and #variance. * * \param data Input data. * \param meanVec Vector of mean values. * \param covariance Covariance matrix. * */ template void meanvar ( Data const& data, blas::vector_container& meanVec, blas::matrix_container& covariance ){ SIZE_CHECK(!data.empty()); typedef typename Batch::type BatchType; std::size_t const dataSize = data.numberOfElements(); std::size_t elementSize=dataDimension(data); covariance().resize(elementSize,elementSize); covariance().clear(); meanVec() = mean(data); //sum of variances of each column for(std::size_t b = 0; b != data.numberOfBatches(); ++b){ //make the batch mean-free BatchType batch = data.batch(b)-repeat(meanVec,data.batch(b).size1()); noalias(covariance) += prod(trans(batch),batch); } covariance() /= double(dataSize); } /*! * \brief Calculates the mean vector of array "x". * * Given a \em d -dimensional array \em x with size \em N1 x ... x \em Nd, * this function calculates the mean vector given as: * \f[ * mean_j = \frac{1}{N1} \sum_{i=1}^{N1} x_{i,j} * \f] * Example: * \f[ * \left( * \begin{array}{*{4}{c}} * 1 & 2 & 3 & 4\\ * 5 & 6 & 7 & 8\\ * 9 & 10 & 11 & 12\\ * \end{array} * \right) * \longrightarrow * \frac{1}{3} * \left( * \begin{array}{*{4}{c}} * 1+5+9 & 2+6+10 & 3+7+11 & 4+8+12\\ * \end{array} * \right) * \longrightarrow * \left( * \begin{array}{*{4}{c}} * 5 & 6 & 7 & 8\\ * \end{array} * \right) * \f] * * \param data input data, from which the * mean value will be calculated * \return the mean vector of \em x */ template VectorType mean(Data const& data){ SIZE_CHECK(!data.empty()); VectorType mean(dataDimension(data),0.0); for(auto& batch: data.batches()){ mean += sum_rows(batch); } mean /= double(data.numberOfElements()); return mean; } /*! * \brief Calculates the variance vector of array "x". * * Given a \em d -dimensional array \em x with size \em N1 x ... x \em Nd * and mean value vector \em m, * this function calculates the variance vector given as: * \f[ * variance = \frac{1}{N1} \sum_{i=1}^{N1} (x_i - m_i)^2 * \f] * * \param data input data from which the variance will be calculated * \return the variance vector of \em x */ template VectorType variance(Data const& data) { RealVector m; // vector of mean values. RealVector v; // vector of variance values meanvar(data,m,v); return v; } /*! * \brief Calculates the covariance matrix of the data vectors stored in * data. * * Given a Set \f$X = (x_{ij})\f$ of \f$n\f$ vectors with length \f$N\f$, * the function calculates the covariance matrix given as * * \f$ * Cov = (c_{kl}) \mbox{,\ } c_{kl} = \frac{1}{n - 1} \sum_{i=1}^n * (x_{ik} - \overline{x_k})(x_{il} - \overline{x_l})\mbox{,\ } * k,l = 1, \dots, N * \f$ * * where \f$\overline{x_j} = \frac{1}{n} \sum_{i = 1}^n x_{ij}\f$ is the * mean value of \f$x_j \mbox{,\ }j = 1, \dots, N\f$. * * \param data The \f$n \times N\f$ input matrix. * \return \f$N \times N\f$ matrix of covariance values. */ template blas::matrix covariance(Data const& data) { RealVector mean; RealMatrix covariance; meanvar(data,mean,covariance); return covariance; } }