/**
>HEADER
    Copyright (c) 2013 Rob Patro robp@cs.cmu.edu

    This file is part of Salmon.

    Salmon is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Salmon is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Salmon.  If not, see <http://www.gnu.org/licenses/>.
<HEADER
**/
#ifdef DEBUG #include #endif #include "pca.h" #include using namespace std; using namespace Eigen; int Pca::Calculate(vector& x, const unsigned int& nrows, const unsigned int& ncols, const bool is_corr, const bool is_center, const bool is_scale) { _ncols = ncols; _nrows = nrows; _is_corr = is_corr; _is_center = is_center; _is_scale = is_scale; if (x.size() != _nrows * _ncols) { return -1; } if ((1 == _ncols) || (1 == nrows)) { return -1; } // Convert vector to Eigen 2-dimensional matrix // Map _xXf(x.data(), _nrows, _ncols); _xXf.resize(_nrows, _ncols); for (unsigned int i = 0; i < _nrows; ++i) { for (unsigned int j = 0; j < _ncols; ++j) { _xXf(i, j) = x[j + i * _ncols]; } } // Mean and standard deviation for each column VectorXf mean_vector(_ncols); mean_vector = _xXf.colwise().mean(); VectorXf sd_vector(_ncols); unsigned int zero_sd_num = 0; float denom = static_cast((_nrows > 1) ? _nrows - 1 : 1); for (unsigned int i = 0; i < _ncols; ++i) { VectorXf curr_col = VectorXf::Constant(_nrows, mean_vector(i)); // mean(x) for column x curr_col = _xXf.col(i) - curr_col; // x - mean(x) curr_col = curr_col.array().square(); // (x-mean(x))^2 sd_vector(i) = sqrt((curr_col.sum()) / denom); if (0 == sd_vector(i)) { zero_sd_num++; } } // If colums with sd == 0 are too many, // don't continue calculations if (1 > _ncols - zero_sd_num) { return -1; } // Delete columns where sd == 0 MatrixXf tmp(_nrows, _ncols - zero_sd_num); VectorXf tmp_mean_vector(_ncols - zero_sd_num); unsigned int curr_col_num = 0; for (unsigned int i = 0; i < _ncols; ++i) { if (0 != sd_vector(i)) { tmp.col(curr_col_num) = _xXf.col(i); tmp_mean_vector(curr_col_num) = mean_vector(i); curr_col_num++; } else { _eliminated_columns.push_back(i); } } _ncols -= zero_sd_num; _xXf = tmp; mean_vector = tmp_mean_vector; tmp.resize(0, 0); tmp_mean_vector.resize(0); // Shift to zero if (true == _is_center) { for (unsigned int i = 0; i < _ncols; ++i) { _xXf.col(i) -= VectorXf::Constant(_nrows, mean_vector(i)); } } // Scale to unit variance if ((false == _is_corr) || (true == _is_scale)) { for (unsigned int i = 0; i < _ncols; ++i) { _xXf.col(i) /= sqrt(_xXf.col(i).array().square().sum() / denom); } } #ifdef DEBUG cout << "\nScaled matrix:\n"; cout << _xXf << endl; cout << "\nMean before scaling:\n" << mean_vector.transpose(); cout << "\nStandard deviation before scaling:\n" << sd_vector.transpose(); #endif // When _nrows < _ncols then svd will be used. // If corr is true and _nrows > _ncols then will be used correlation matrix // (TODO): What about covariance? if ((_nrows < _ncols) || (false == _is_corr)) { // Singular Value Decomposition is on _method = "svd"; JacobiSVD svd(_xXf, ComputeThinV); VectorXf eigen_singular_values = svd.singularValues(); VectorXf tmp_vec = eigen_singular_values.array().square(); float tmp_sum = tmp_vec.sum(); tmp_vec /= tmp_sum; // PC's standard deviation and // PC's proportion of variance _kaiser = 0; unsigned int lim = (_nrows < _ncols) ? 
    for (unsigned int i = 0; i < lim; ++i) {
      _sd.push_back(eigen_singular_values(i) / sqrt(denom));
      if (_sd[i] >= 1) {
        _kaiser = i + 1;
      }
      _prop_of_var.push_back(tmp_vec(i));
    }
#ifdef DEBUG
    cout << "\n\nStandard deviations for PCs:\n";
    copy(_sd.begin(), _sd.end(), std::ostream_iterator<float>(std::cout, " "));
    cout << "\n\nKaiser criterion: PC #" << _kaiser << endl;
#endif
    tmp_vec.resize(0);

    // PC cumulative proportion of variance
    _thresh95 = 1;
    _cum_prop.push_back(_prop_of_var[0]);
    for (unsigned int i = 1; i < _prop_of_var.size(); ++i) {
      _cum_prop.push_back(_cum_prop[i - 1] + _prop_of_var[i]);
      if (_cum_prop[i] < 0.95) {
        _thresh95 = i + 1;
      }
    }
#ifdef DEBUG
    cout << "\nCumulative proportion:\n";
    copy(_cum_prop.begin(), _cum_prop.end(), std::ostream_iterator<float>(std::cout, " "));
    cout << "\n\nThresh95 criterion: PC #" << _thresh95 << endl;
#endif

    // Scores
    MatrixXf eigen_scores = _xXf * svd.matrixV();
#ifdef DEBUG
    cout << "\n\nRotated values (scores):\n" << eigen_scores;
#endif
    _scores.reserve(lim * lim);
    for (unsigned int i = 0; i < lim; ++i) {
      for (unsigned int j = 0; j < lim; ++j) {
        _scores.push_back(eigen_scores(i, j));
      }
    }
    eigen_scores.resize(0, 0);
#ifdef DEBUG
    cout << "\n\nScores in vector:\n";
    copy(_scores.begin(), _scores.end(), std::ostream_iterator<float>(std::cout, " "));
    cout << "\n";
#endif
  } else {
    // Correlation / covariance matrix path
    _method = "cor";

    // Calculate the covariance matrix
    MatrixXf eigen_cov; // = MatrixXf::Zero(_ncols, _ncols);
    VectorXf sds;
    // (TODO) Should be a weighted cov matrix, even if is_center == false
    eigen_cov = (1.0 / (_nrows /*- 1*/)) * _xXf.transpose() * _xXf;
    sds = eigen_cov.diagonal().array().sqrt();
    MatrixXf outer_sds = sds * sds.transpose();
    eigen_cov = eigen_cov.array() / outer_sds.array();
    outer_sds.resize(0, 0);
    // If the data matrix is scaled, the covariance matrix equals the correlation matrix
#ifdef DEBUG
    cout << eigen_cov << endl;
#endif
    EigenSolver<MatrixXf> edc(eigen_cov);
    VectorXf eigen_eigenvalues = edc.eigenvalues().real();
#ifdef DEBUG
    cout << endl << eigen_eigenvalues.transpose() << endl;
#endif
    MatrixXf eigen_eigenvectors = edc.eigenvectors().real();
#ifdef DEBUG
    cout << endl << eigen_eigenvectors << endl;
#endif
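    // Note: EigenSolver is Eigen's general (non-symmetric) solver, so it reports
    // complex eigenvalues and eigenvectors; because eigen_cov is a symmetric
    // correlation matrix its spectrum is real, which is why only the real parts
    // are kept above. A SelfAdjointEigenSolver<MatrixXf> would also work here.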
    // The eigenvalues and eigenvectors are not returned in any particular order,
    // so sort them ourselves
    typedef pair<float, unsigned int> eigen_pair;
    vector<eigen_pair> ep;
    for (unsigned int i = 0; i < _ncols; ++i) {
      ep.push_back(make_pair(eigen_eigenvalues(i), i));
    }
    sort(ep.begin(), ep.end()); // Ascending order by default

    // Re-order eigenvalues/eigenvectors in descending order
    MatrixXf eigen_eigenvectors_sorted =
        MatrixXf::Zero(eigen_eigenvectors.rows(), eigen_eigenvectors.cols());
    VectorXf eigen_eigenvalues_sorted = VectorXf::Zero(_ncols);
    int colnum = 0;
    int i = ep.size() - 1;
    for (; i > -1; i--) {
      eigen_eigenvalues_sorted(colnum) = ep[i].first;
      eigen_eigenvectors_sorted.col(colnum++) += eigen_eigenvectors.col(ep[i].second);
    }
#ifdef DEBUG
    cout << endl << eigen_eigenvalues_sorted.transpose() << endl;
    cout << endl << eigen_eigenvectors_sorted << endl;
#endif
    // The unsorted arrays are no longer needed
    eigen_eigenvalues.resize(0);
    eigen_eigenvectors.resize(0, 0);

    _sd.clear();
    _prop_of_var.clear();
    _kaiser = 0;
    float tmp_sum = eigen_eigenvalues_sorted.sum();
    for (unsigned int i = 0; i < _ncols; ++i) {
      _sd.push_back(sqrt(eigen_eigenvalues_sorted(i)));
      if (_sd[i] >= 1) {
        _kaiser = i + 1;
      }
      _prop_of_var.push_back(eigen_eigenvalues_sorted(i) / tmp_sum);
    }
#ifdef DEBUG
    cout << "\nStandard deviations for PCs:\n";
    copy(_sd.begin(), _sd.end(), std::ostream_iterator<float>(std::cout, " "));
    cout << "\nProportion of variance:\n";
    copy(_prop_of_var.begin(), _prop_of_var.end(), std::ostream_iterator<float>(std::cout, " "));
    cout << "\nKaiser criterion: PC #" << _kaiser << endl;
#endif

    // PC cumulative proportion of variance
    _cum_prop.clear();
    _thresh95 = 1;
    _cum_prop.push_back(_prop_of_var[0]);
    for (unsigned int i = 1; i < _prop_of_var.size(); ++i) {
      _cum_prop.push_back(_cum_prop[i - 1] + _prop_of_var[i]);
      if (_cum_prop[i] < 0.95) {
        _thresh95 = i + 1;
      }
    }
#ifdef DEBUG
    cout << "\n\nCumulative proportions:\n";
    copy(_cum_prop.begin(), _cum_prop.end(), std::ostream_iterator<float>(std::cout, " "));
    cout << "\n\n95% threshold: PC #" << _thresh95 << endl;
#endif

    // Scores for PCA with the correlation matrix:
    // scale before calculating the new values
    for (unsigned int i = 0; i < _ncols; ++i) {
      _xXf.col(i) /= sds(i);
    }
    sds.resize(0);
    MatrixXf eigen_scores = _xXf * eigen_eigenvectors_sorted;
#ifdef DEBUG
    cout << "\n\nRotated values (scores):\n" << eigen_scores;
#endif
    _scores.clear();
    _scores.reserve(_ncols * _nrows);
    for (unsigned int i = 0; i < _nrows; ++i) {
      for (unsigned int j = 0; j < _ncols; ++j) {
        _scores.push_back(eigen_scores(i, j));
      }
    }
    eigen_scores.resize(0, 0);
#ifdef DEBUG
    cout << "\n\nScores in vector:\n";
    copy(_scores.begin(), _scores.end(), std::ostream_iterator<float>(std::cout, " "));
    cout << "\n";
#endif
  }
  return 0;
}

std::vector<float> Pca::sd(void) { return _sd; }
std::vector<float> Pca::prop_of_var(void) { return _prop_of_var; }
std::vector<float> Pca::cum_prop(void) { return _cum_prop; }
std::vector<float> Pca::scores(void) { return _scores; }
std::vector<unsigned int> Pca::eliminated_columns(void) { return _eliminated_columns; }
string Pca::method(void) { return _method; }
unsigned int Pca::kaiser(void) { return _kaiser; }
unsigned int Pca::thresh95(void) { return _thresh95; }
unsigned int Pca::ncols(void) { return _ncols; }
unsigned int Pca::nrows(void) { return _nrows; }
bool Pca::is_scale(void) { return _is_scale; }
bool Pca::is_center(void) { return _is_center; }

Pca::Pca(void) {
  _nrows = 0;
  _ncols = 0;
  // Variables are centered and scaled by default
  _is_center = true;
  _is_scale = true;
  // Singular value decomposition is used by default
  _method = "svd";
  _is_corr = false;
  _kaiser = 0;
  _thresh95 = 1;
}

Pca::~Pca(void) {
  _xXf.resize(0, 0);
  _x.clear();
}
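// A minimal usage sketch of the Pca class defined above. PCA_EXAMPLE_MAIN is an
// illustrative macro name assumed here (it is not defined by the build); compile
// with -DPCA_EXAMPLE_MAIN to build this as a standalone check.
#ifdef PCA_EXAMPLE_MAIN
#include <iostream>
int main() {
  // Row-major data: 4 observations (rows) x 3 variables (columns).
  std::vector<float> data = {1.0f, 2.0f, 4.0f,
                             2.0f, 4.0f, 8.0f,
                             3.0f, 7.0f, 1.0f,
                             4.0f, 9.0f, 5.0f};
  Pca pca;
  // is_corr = false selects the SVD path; center and scale the columns first.
  if (pca.Calculate(data, 4, 3, false, true, true) != 0) {
    std::cerr << "PCA failed (bad dimensions or only constant columns)\n";
    return 1;
  }
  std::cout << "method: " << pca.method() << "\n";
  for (auto s : pca.sd()) { std::cout << "sd: " << s << "\n"; }
  for (auto p : pca.cum_prop()) { std::cout << "cumulative proportion: " << p << "\n"; }
  return 0;
}
#endif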