* This file is part of Shark. * * * Shark is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Shark is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with Shark. If not, see . * */ #ifndef REMORA_DECOMPOSITIONS_HPP #define REMORA_DECOMPOSITIONS_HPP #include "kernels/trsm.hpp" #include "kernels/trsv.hpp" #include "kernels/potrf.hpp" #include "kernels/pstrf.hpp" #include "kernels/getrf.hpp" #include "kernels/syev.hpp" #include "assignment.hpp" #include "permutation.hpp" #include "matrix_expression.hpp" #include "proxy_expressions.hpp" #include "vector_expression.hpp" namespace remora{ template struct solver_expression{ typedef Device device_type; D const& operator()() const { return *static_cast(this); } D& operator()() { return *static_cast(this); } }; /// \brief Lower triangular Cholesky decomposition. /// /// Given an \f$ m \times m \f$ symmetric positive definite matrix /// \f$A\f$, represents the lower triangular matrix \f$L\f$ such that /// \f$A = LL^T \f$. /// /// This decomposition is a corner block of many linear algebra routines /// especially for solving symmetric positive definite systems of equations. /// /// Unlike many other decompositions, this decomposition is fast, /// numerically stable and can be updated when the original matrix is changed /// (rank-k updates of the form A<-alpha A + VCV^T) template class cholesky_decomposition: public solver_expression< cholesky_decomposition, typename MatrixStorage::device_type >{ private: typedef typename MatrixStorage::value_type value_type; public: typedef typename MatrixStorage::device_type device_type; cholesky_decomposition(){} template cholesky_decomposition(matrix_expression const& e):m_cholesky(e){ kernels::potrf(m_cholesky); } template void decompose(matrix_expression const& e){ m_cholesky.resize(e().size1(), e().size2()); noalias(m_cholesky) = e; kernels::potrf(m_cholesky); } MatrixStorage const& lower_factor()const{ return m_cholesky; } auto upper_factor()const -> decltype(trans(std::declval())){ return trans(m_cholesky); } template void solve(matrix_expression& B, left)const{ kernels::trsm(lower_factor(),B); kernels::trsm(upper_factor(),B); } template void solve(matrix_expression& B, right)const{ kernels::trsm(upper_factor(),B); kernels::trsm(lower_factor(),B); } template void solve(vector_expression&b, system_tag)const{ kernels::trsv(lower_factor(),b); kernels::trsv(upper_factor(),b); } /// \brief Updates a covariance factor by a rank one update /// /// Let \f$ A=LL^T \f$ be a matrix with its lower cholesky factor. Assume we want to update /// A using a simple rank-one update \f$ A = \alpha A+ \beta vv^T \f$. This invalidates L and /// it needs to be recomputed which is O(n^3). instead we can update the factorisation /// directly by performing a similar, albeit more complex algorithm on L, which can be done /// in O(L^2). /// /// Alpha is not required to be positive, but if it is not negative, one has to be carefull /// that the update would keep A positive definite. Otherwise the decomposition does not /// exist anymore and an exception is thrown. /// /// \param v the update vector /// \param alpha the scaling factor, must be positive. /// \param beta the update factor. it Can be positive or negative template void update(value_type alpha, value_type beta, vector_expression const& v){ if(beta == 0){ m_cholesky *= std::sqrt(alpha); return; } //implementation blatantly stolen from Eigen std::size_t n = v().size(); auto& L = m_cholesky; typename vector_temporary::type temp = v; double beta_prime = 1; double a = std::sqrt(alpha); for(std::size_t j=0; j != n; ++j) { double Ljj = a * L(j,j); double dj = Ljj * Ljj; double wj = temp(j); double swj2 = beta * wj * wj; double gamma = dj * beta_prime + swj2; double x = dj + swj2/beta_prime; if (x <= 0.0) throw std::invalid_argument("[cholesky_decomposition::update] update makes matrix indefinite, no update available"); double nLjj = std::sqrt(x); L(j,j) = nLjj; beta_prime += swj2/dj; // Update the terms of L if(j+1 void serialize( Archive & ar, const std::size_t version ) { ar & m_cholesky; } private: MatrixStorage m_cholesky; }; /// \brief Symmetric eigenvalue decomposition A=QDQ^T /// /// every symmetric matrix can be decomposed into its eigenvalue decomposition /// A=QDQ^T, where Q is an orthogonal matrix with Q^TQ=QQ^T=I /// and D is the diagonal matrix of eigenvalues of A. template class symm_eigenvalue_decomposition: public solver_expression< symm_eigenvalue_decomposition, typename MatrixStorage::device_type >{ private: typedef typename MatrixStorage::value_type value_type; typedef typename vector_temporary::type VectorStorage; public: typedef typename MatrixStorage::device_type device_type; symm_eigenvalue_decomposition(){} template symm_eigenvalue_decomposition(matrix_expression const& e){ decompose(e); } template void decompose(matrix_expression const& e){ REMORA_SIZE_CHECK(e().size1() == e().size2()); m_eigenvectors.resize(e().size1(),e().size1()); m_eigenvalues.resize(e().size1()); noalias(m_eigenvectors) = e; kernels::syev(m_eigenvectors,m_eigenvalues); } MatrixStorage const& Q()const{ return m_eigenvectors; } VectorStorage const& D()const{ return m_eigenvalues; } template void solve(matrix_expression& B, left)const{ B() = Q() % to_diagonal(elem_inv(D()))% trans(Q()) % B; } template void solve(matrix_expression& B, right)const{ auto transB = trans(B); solve(transB,left()); } template void solve(vector_expression&b, left)const{ b() = Q() % safe_div(trans(Q()) % b,D() ,0.0); } template void solve(vector_expression&b, right)const{ solve(b,left()); } template void serialize( Archive & ar, const std::size_t version ) { ar & m_eigenvectors; ar & m_eigenvalues; } private: MatrixStorage m_eigenvectors; VectorStorage m_eigenvalues; }; template class pivoting_lu_decomposition: public solver_expression< pivoting_lu_decomposition, typename MatrixStorage::device_type >{ public: typedef typename MatrixStorage::device_type device_type; template pivoting_lu_decomposition(matrix_expression const& e) :m_factor(e), m_permutation(e().size1()){ kernels::getrf(m_factor,m_permutation); } MatrixStorage const& factor()const{ return m_factor; } permutation_matrix const& permutation() const{ return m_permutation; } template void solve(matrix_expression& B, left)const{ swap_rows(m_permutation,B); kernels::trsm(m_factor,B); kernels::trsm(m_factor,B); } template void solve(matrix_expression& B, right)const{ kernels::trsm(m_factor,B); kernels::trsm(m_factor,B); swap_columns_inverted(m_permutation,B); } template void solve(vector_expression& b, left)const{ swap_rows(m_permutation,b); kernels::trsv(m_factor,b); kernels::trsv(m_factor,b); } template void solve(vector_expression& b, right)const{ kernels::trsv(m_factor,b); kernels::trsv(m_factor,b); swap_rows_inverted(m_permutation,b); } private: MatrixStorage m_factor; permutation_matrix m_permutation; }; // This is an implementation suggested by // "Fast Computation of Moore-Penrose Inverse Matrices" // applied to the special case of symmetric pos semi-def matrices // trading numerical accuracy vs speed. We go for speed. // // The fact that A is not full rank means it is not invertable, // so we solve it in a least squares sense. // // We use the formula for the pseudo-inverse: // (P^T A P)^-1 = L(L^TL)^-1(L^TL)^-1 L^T // where L is a matrix obtained by some rank revealing factorization // P^T A P = L L^T // we chose a pivoting cholesky to make use of the fact that A is symmetric // and all eigenvalues are >=0. If A has full rank, this reduces to // the cholesky factor where the pivoting leads to slightly smaller numerical errors // At a higher computational cost compared to the normal cholesky decomposition. template class symm_pos_semi_definite_solver: public solver_expression, typename MatrixStorage::device_type>{ public: typedef typename MatrixStorage::device_type device_type; template symm_pos_semi_definite_solver(matrix_expression const& e) :m_factor(e), m_permutation(e().size1()){ m_rank = kernels::pstrf(m_factor,m_permutation); if(m_rank == e().size1()) return; //full rank, so m_factor is lower triangular and we are done auto L = columns(m_factor,0,m_rank); m_cholesky.decompose(prod(trans(L),L)); } std::size_t rank()const{ return m_rank; } //compute C so that A^dagger = CC^T //where A^dagger is the moore-penrose inverse // m must be of size rank x n template void compute_inverse_factor(matrix_expression& C)const{ REMORA_SIZE_CHECK(C().size1() == m_rank); REMORA_SIZE_CHECK(C().size2() == m_factor.size1()); if(m_rank == m_factor.size1()){//matrix has full rank //initialize as identity matrix and solve noalias(C) = identity_matrix( m_factor.size1()); swap_columns_inverted(m_permutation,C); kernels::trsm(m_factor,C); }else{ auto L = columns(m_factor,0,m_rank); noalias(C) = trans(L); m_cholesky.solve(C,left()); swap_columns_inverted(m_permutation,C); } } template void solve(matrix_expression& B, left)const{ swap_rows(m_permutation,B); if(m_rank == 0){//matrix is zero B().clear(); }else if(m_rank == m_factor.size1()){//matrix has full rank kernels::trsm(m_factor,B); kernels::trsm(trans(m_factor),B); }else{//matrix is missing rank auto L = columns(m_factor,0,m_rank); auto Z = eval_block(prod(trans(L),B)); m_cholesky.solve(Z,left()); m_cholesky.solve(Z,left()); noalias(B) = prod(L,Z); } swap_rows_inverted(m_permutation,B); } template void solve(matrix_expression& B, right)const{ //compute using symmetry of the system of equations auto transB = trans(B); solve(transB, left()); } template void solve(vector_expression& b, system_tag)const{ swap_rows(m_permutation,b); if(m_rank == 0){//matrix is zero b().clear(); }else if(m_rank == m_factor.size1()){//matrix has full rank kernels::trsv(m_factor,b); kernels::trsv(trans(m_factor),b); }else{//matrix is missing rank auto L = columns(m_factor,0,m_rank); auto z = eval_block(prod(trans(L),b)); m_cholesky.solve(z,left()); m_cholesky.solve(z,left()); noalias(b) = prod(L,z); } swap_rows_inverted(m_permutation,b); } private: std::size_t m_rank; MatrixStorage m_factor; cholesky_decomposition m_cholesky; permutation_matrix m_permutation; }; template class cg_solver:public solver_expression, typename MatA::device_type>{ public: typedef typename MatA::const_closure_type matrix_closure_type; typedef typename MatA::device_type device_type; cg_solver(matrix_closure_type const& e, double epsilon = 1.e-10, unsigned int max_iterations = 0) :m_expression(e), m_epsilon(epsilon), m_max_iterations(max_iterations){} template void solve( matrix_expression& B, left, double epsilon, unsigned max_iterations )const{ typename matrix_temporary::type X = B; cg(m_expression,X, B, epsilon, max_iterations); noalias(B) = X; } template void solve( matrix_expression& B, right, double epsilon, unsigned max_iterations )const{ auto transB = trans(B); typename transposed_matrix_temporary::type X = transB; cg(m_expression,X,transB, epsilon, max_iterations); noalias(transB) = X; } template void solve( vector_expression&b, system_tag, double epsilon, unsigned max_iterations )const{ typename vector_temporary::type x = b; cg(m_expression,x,b,epsilon,max_iterations); noalias(b) = x; } template void solve(vector_expression&b, system_tag tag)const{ solve(b, tag, m_epsilon, m_max_iterations); } template void solve(matrix_expression&B, system_tag tag)const{ solve(B, tag, m_epsilon, m_max_iterations); } private: template void cg( matrix_expression const& A, vector_expression& x, vector_expression const& b, double epsilon, unsigned int max_iterations )const{ REMORA_SIZE_CHECK(A().size1() == A().size2()); REMORA_SIZE_CHECK(A().size1() == b().size()); REMORA_SIZE_CHECK(A().size1() == x().size()); typedef typename vector_temporary::type vector_type; std::size_t dim = b().size(); //initialize point. vector_type residual = b - prod(A,x); //check if provided solution is better than starting at 0 if(norm_inf(residual) > norm_inf(b)){ x().clear(); residual = b; } vector_type next_residual(dim); //the next residual vector_type p = residual; //the search direction- initially it is the gradient direction vector_type Ap(dim); //stores prod(A,p) for(std::size_t iter = 0;; ++iter){ if(max_iterations != 0 && iter >= max_iterations) break; noalias(Ap) = prod(A,p); double rsqr = norm_sqr(residual); double alpha = rsqr/inner_prod(p,Ap); noalias(x) += alpha * p; noalias(next_residual) = residual - alpha * Ap; if(norm_inf(next_residual) < epsilon) break; double beta = inner_prod(next_residual,next_residual)/rsqr; p *= beta; noalias(p) += next_residual; swap(residual,next_residual); } } template void cg( matrix_expression const& A, matrix_expression& X, matrix_expression const& B, double epsilon, unsigned int max_iterations )const{ REMORA_SIZE_CHECK(A().size1() == A().size2()); REMORA_SIZE_CHECK(A().size1() == B().size1()); REMORA_SIZE_CHECK(A().size1() == X().size1()); REMORA_SIZE_CHECK(B().size2() == X().size2()); typedef typename vector_temporary::type vector_type; typedef typename matrix_temporary::type matrix_type; std::size_t dim = B().size1(); std::size_t num_rhs = B().size2(); //initialize gradient given the starting point matrix_type residual = B - prod(A,X); //check for each rhs whether the starting point is better than starting from scratch for(std::size_t i = 0; i != num_rhs; ++i){ if(norm_inf(column(residual,i)) <= norm_inf(column(residual,i))){ column(X,i).clear(); noalias(column(residual,i)) = column(B,i); } } vector_type next_residual(dim); //the next residual of a column matrix_type P = residual; //the search direction- initially it is the gradient direction matrix_type AP(dim, num_rhs); //stores prod(A,p) for(std::size_t iter = 0;; ++iter){ if(max_iterations != 0 && iter >= max_iterations) break; //compute the product for all rhs at the same time noalias(AP) = prod(A,P); //for each rhs apply a step of cg for(std::size_t i = 0; i != num_rhs; ++i){ auto r = column(residual,i); //skip this if we are done already //otherwise we might run into numerical troubles later on if(norm_inf(r) < epsilon) continue; auto x = column(X,i); auto p = column(P,i); auto Ap = column(AP,i); double rsqr = norm_sqr(r); double alpha = rsqr/inner_prod(p,Ap); noalias(x) += alpha * p; noalias(next_residual) = r - alpha * Ap; double beta = inner_prod(next_residual,next_residual)/rsqr; p *= beta; noalias(p) += next_residual; noalias(r) = next_residual; } //if all solutions are within tolerance, we are done if(max(abs(residual)) < epsilon) break; } } matrix_closure_type m_expression; double m_epsilon; unsigned int m_max_iterations; }; ///////////////////////////////////////////////////////////////// ////Traits connecting decompositions/solvers with tags ///////////////////////////////////////////////////////////////// struct symm_pos_def{ typedef symm_pos_def transposed_orientation;}; struct symm_semi_pos_def{ typedef symm_semi_pos_def transposed_orientation;}; struct indefinite_full_rank{ typedef indefinite_full_rank transposed_orientation;}; struct conjugate_gradient{ typedef conjugate_gradient transposed_orientation; double epsilon; unsigned max_iterations; conjugate_gradient(double epsilon = 1.e-10, unsigned max_iterations = 0) :epsilon(epsilon), max_iterations(max_iterations){} }; namespace detail{ template struct solver_traits; template struct solver_traits >{ class type: public solver_expression{ public: typedef typename MatA::const_closure_type matrix_closure_type; typedef typename MatA::device_type device_type; type(matrix_closure_type const& e, triangular_tag):m_matrix(e){} template void solve(matrix_expression& B, system_tag ){ kernels::trsm,system_tag >(m_matrix,B); } template void solve(vector_expression& b, system_tag ){ kernels::trsv,system_tag >(m_matrix,b); } private: matrix_closure_type m_matrix; }; }; template struct solver_traits{ struct type : public cholesky_decomposition::type>{ template type(M const& m, symm_pos_def) :cholesky_decomposition::type>(m){} }; }; template struct solver_traits{ struct type : public pivoting_lu_decomposition::type>{ template type(M const& m, indefinite_full_rank) :pivoting_lu_decomposition::type>(m){} }; }; template struct solver_traits{ struct type : public symm_pos_semi_definite_solver::type>{ template type(M const& m, symm_semi_pos_def) :symm_pos_semi_definite_solver::type>(m){} }; }; template struct solver_traits{ struct type : public cg_solver{ template type(M const& m, conjugate_gradient t):cg_solver(m,t.epsilon,t.max_iterations){} }; }; } template struct solver:public detail::solver_traits::type{ template solver(M const& m, SolverType t = SolverType()): detail::solver_traits::type(m,t){} }; } #endif