//===========================================================================
/*!
 *
 * \brief       The k-means clustering algorithm.
 *
 *
 * \author      T. Glasmachers
 * \date        2011
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *

* This file is part of Shark. * * * Shark is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Shark is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with Shark. If not, see . * */ //=========================================================================== #ifndef SHARK_ALGORITHMS_KMEANS_H #define SHARK_ALGORITHMS_KMEANS_H #include #include #include #include #include #include namespace shark{ /// /// \brief The k-means clustering algorithm. /// /// \par /// The k-means algorithm takes vector-valued data /// \f$ \{x_1, \dots, x_n\} \subset \mathbb R^d \f$ /// and splits it into k clusters, based on centroids /// \f$ \{c_1, \dots, c_k\} \f$. /// The result is stored in a Centroids object that can be used to /// construct clustering models. /// /// \par /// This implementation starts the search with the given centroids, /// in case the provided centroids object (third parameter) contains /// a set of k centroids. Otherwise the search starts from the first /// k data points. /// /// \par /// Note that the data set needs to include at least k data points /// for k-means to work. This is because the current implementation /// does not allow for empty clusters. 
/// /// \param data vector-valued data to be clustered /// \param k number of clusters /// \param centroids centroids input/output /// \param maxIterations maximum number of k-means iterations; 0: unlimited /// \return number of k-means iterations /// SHARK_EXPORT_SYMBOL std::size_t kMeans(Data const& data, std::size_t k, Centroids& centroids, std::size_t maxIterations = 0); /// /// \brief The k-means clustering algorithm for initializing an RBF Layer /// /// \par /// The k-means algorithm takes vector-valued data /// \f$ \{x_1, \dots, x_n\} \subset \mathbb R^d \f$ /// and splits it into k clusters, based on centroids /// \f$ \{c_1, \dots, c_k\} \f$. /// The result is stored in a RBFLayer object that can be used to /// construct clustering models. /// /// \par /// This is just an alternative frontend to the version using Centroids. it creates a centroid object, /// with as many clusters as are outputs in the RBFLayer and copies the result into the model. /// /// \par /// Note that the data set needs to include at least k data points /// for k-means to work. This is because the current implementation /// does not allow for empty clusters. /// /// \param data vector-valued data to be clustered /// \param model RBFLayer input/output /// \param maxIterations maximum number of k-means iterations; 0: unlimited /// \return number of k-means iterations /// SHARK_EXPORT_SYMBOL std::size_t kMeans(Data const& data, RBFLayer& model, std::size_t maxIterations = 0); /// /// \brief The kernel k-means clustering algorithm /// /// \par /// The kernel k-means algorithm takes data /// \f$ \{x_1, \dots, x_n\} \f$ /// and splits it into k clusters, based on centroids /// \f$ \{c_1, \dots, c_k\} \f$. /// The centroids are elements of the reproducing kernel Hilbert space /// (RHKS) induced by the kernel function. They are functions, represented /// as the components of a KernelExpansion object. 
I.e., given a data point /// x, the kernel expansion returns a k-dimensional vector f(x), which is /// the evaluation of the centroid functions on x. The value of the /// centroid function represents the inner product of the centroid with /// the kernel-induced feature vector of x (embedding of x into the RKHS). /// The distance of x from the centroid \f$ c_i \f$ is computes as the /// kernel-induced distance /// \f$ \sqrt{ kernel(x, x) + kernel(c_i, c_i) - 2 kernel(x, c_i) } \f$. /// For the Gaussian kernel (and other normalized kernels) is simplifies to /// \f$ \sqrt{ 2 - 2 kernel(x, c_i) } \f$. Hence, larger function values /// indicate smaller distance to the centroid. /// /// \par /// Note that the data set needs to include at least k data points /// for k-means to work. This is because the current implementation /// does not allow for empty clusters. /// /// \param dataset vector-valued data to be clustered /// \param k number of clusters /// \param kernel kernel function object /// \param maxIterations maximum number of k-means iterations; 0: unlimited /// \return centroids (represented as functions, see description) /// template KernelExpansion kMeans(Data const& dataset, std::size_t k, AbstractKernelFunction& kernel, std::size_t maxIterations = 0){ if(!maxIterations) maxIterations = std::numeric_limits::max(); std::size_t n = dataset.numberOfElements(); RealMatrix kernelMatrix = calculateRegularizedKernelMatrix(kernel,dataset,0); UIntVector clusterMembership(n); UIntVector clusterSizes(k,0); RealVector ckck(k,0); //init cluster assignments for(unsigned int i = 0; i != n; ++i){ clusterMembership(i) = i % k; } std::shuffle(clusterMembership.begin(),clusterMembership.end(),random::globalRng); for(std::size_t i = 0; i != n; ++i){ ++clusterSizes(clusterMembership(i)); } // k-means loop std::size_t iter = 0; bool equal = false; for(; iter != maxIterations && !equal; ++iter) { //the clustering model results in centers c_k= 1/n_k sum_i k(x_i,.) 
for all x_i points of cluster k //we need to compute the squared distances between all centers and points that is //d^2(c_k,x_i) = -2 < c_k,x_i> + for the i-th point. //thus we precompute = sum_ij k(x_i,x_j)/(n_k)^2 for all x_i,x_j points of cluster k ckck.clear(); for(std::size_t i = 0; i != n; ++i){ std::size_t c1 = clusterMembership(i); for(std::size_t j = 0; j != n; ++j){ std::size_t c2 = clusterMembership(j); if(c1 != c2) continue; ckck(c1) += kernelMatrix(i,j); } } noalias(ckck) = safe_div(ckck,sqr(clusterSizes),0); UIntVector newClusterMembership(n); RealVector currentDistances(k); for(std::size_t i = 0; i != n; ++i){ //compute squared distances between the i-th point and the centers //we skip as it is always the same for all elements and we don't need it for comparison noalias(currentDistances) = ckck; for(std::size_t j = 0; j != n; ++j){ std::size_t c = clusterMembership(j); currentDistances(c) -= 2* kernelMatrix(i,j)/clusterSizes(c); } //choose the index with the smallest distance as new cluster newClusterMembership(i) = (unsigned int) arg_min(currentDistances); } equal = boost::equal( newClusterMembership, clusterMembership ); noalias(clusterMembership) = newClusterMembership; //compute new sizes of clusters clusterSizes.clear(); for(std::size_t i = 0; i != n; ++i){ ++clusterSizes(clusterMembership(i)); } //if a cluster is empty then assign a random point to it for(unsigned int i = 0; i != k; ++i){ if(clusterSizes(i) == 0){ std::size_t elem = random::uni(random::globalRng, std::size_t(0), n-1); --clusterSizes(clusterMembership(elem)); clusterMembership(elem)=i; clusterSizes(i) = 1; } } } //copy result in the expansion KernelExpansion expansion; expansion.setStructure(&kernel,dataset,true,k); expansion.offset() = -ckck; expansion.alpha().clear(); for(std::size_t i = 0; i != n; ++i){ std::size_t c = clusterMembership(i); expansion.alpha()(i,c) = 2.0 / clusterSizes(c); } return expansion; } } // namespace shark #endif