## # mae.simplePCAanalysis() - compute PCA on data and generate plots in a PDF file. # For using the color based plotting function, the (uniqGroups, group) # are required. # It creates one PDF file with 7 plots: # Variance X Components, # three plots: (PC1 vs PC2, PC2 vs PC3, PC1 vs PC3) for "arrays" or "genes", # three sorted Loadings plots for PC1, PC2 and PC3. # Args: # data - the horizontally stacked array data (genes X array samples) # plotFile - optional name of the PDF file, else use "simplePCA". # mode - defines how the data is to be analyzed for computing PCA: # "arrays" - analyzed by gene components (Gene Principal Components) # "genes" - analyzed by array components (Array Principal Components) # scale.data - apply scaling in prcomp, else do not scale # uniqGroups - the unique condition names. # group - the condition names/sample. This vector size = number samples # allowableEigenVectors - the number of allowable EigenVectors (default 6) # Return: pca, eigenVals, eigenVec, points3D, loadings # Developer: Shyam(shyamss@hotmail.com). Edited by P. Lemkin # Date : 07/18/2002 # # Notes: # 1. This routine does the Principal components analysis(mva) for data either # [Genes(rows), Arrays(columns)] or vice versa. This makes use of the PCA # algorithm in the "mva" package of "R". We use the prcomp as the preferred # as the performance is better than the princomp method. The eigen vectors of # of the first two components may have a "-1" difference. This does not # affect the analysis, it is just the coordinate that will have the shift # amongst the two methods. # 2. Generation of the detailed information about the PCA # including an analysis file with more details, pdf file with the plots # File that will contain the 3D data points that can be used by a 3D # viewer program. # 3. Experimenting with the plotting of the coefficients of the variables # across the components. # 4. Also added to the code is the sorting of returned eigen vector(loading) # values, so that for the first three components we know the effect of the # original variable. # 5. When the matrix(rows,columns) based on the mode ( genes/arrays) has the # the condition of the NumberOfRows < NumberOfColumns, the Eigen values # upto NumberOfRows will be non-zero values and the remainder will have # zero/singular values. Please read the following reference for more # information. Numerical recipes(SVD): # http://www.library.cornell.edu/nr/bookcpdf/c2-6.pdf # 6. Noticed that the number of printable eigen vectors were restricted to the # first 6 but in cases when we have less than 6 we need to handle likewise. # 7. This code is also used in the mAdb system (http://nciarray.nci.nih.gov/) # from which it has been adapted. ## mae.simplePCAanalysis <- function(data, plotFile="simplePCA", mode="arrays", scale.data= TRUE, uniqGroups, group, allowableEigenVectors=6 ) { # mae.simplePCAanalysis if(!require(mva)) return("library mva not found") # [1] Computing the PCA if (mode == "arrays") { # for arrays, compute PCA on the transpose d1pca <- prcomp(t(data),scale=scale.data) } else { #for genes, compute the PCA d1pca <- prcomp(data,scale=scale.data) } # [2] Compute the number of printable Eigen Vectors # We limit it here to 6 which can of course be set to higher # values but the first few ( 3 or so ) should account for # maximum variance. allowableEigenVectors <- 6 curEigenVectors <- ncol(d1pca$rotation) if (curEigenVectors < allowableEigenVectors) nEigenVecs <- min(curEigenVectors,allowableEigenVectors) else nEigenVecs <-allowableEigenVectors # [3] Create simple vectors of the PCA results eigenVals <- d1pca$sdev^2 eigenVec <- d1pca$rotation[,1:6] # Create 3D data for the points. points3D <- cbind(d1pca$x[,1],d1pca$x[,2],d1pca$x[,3]) # [4] Sorting the loadings L1 <- data.frame(ID=c(1:nrow(d1pca$rotation)), VAL=d1pca$rotation[,1]) L2 <- data.frame(ID=c(1:nrow(d1pca$rotation)), VAL=d1pca$rotation[,2]) L3 <- data.frame(ID=c(1:nrow(d1pca$rotation)), VAL=d1pca$rotation[,3]) L1s <- L1[sort.list(L1$VAL),] L2s <- L2[sort.list(L2$VAL),] L3s <- L3[sort.list(L3$VAL),] loadings <- list(L1s, L2s, L3s) # for return list # [5] Put all analysis plots in one PDF file as 2x2 panels/page. # 1. Variance across principal components # 2. PC1 vs PC2 (arrays or genes) # 3. PC2 vs PC3 (arrays or genes) # 4. PC1 vs PC3 (arrays or genes) # 5. Loading: coefficient PC1 vs variable effects # 6. Loading: coefficient PC2 vs variable effects # 7. Loading: coefficient PC3 vs variable effects # pdf(plotFile,encoding="ISOLatin1",width=12,height=12) opar <- par(mfrow=c(2,2),pch=19,lty=2) maxComp1 <- max(abs(d1pca$x[,1])) xrange <- c(-maxComp1,maxComp1) yrange <- c(-maxComp1,maxComp1) # [5.1] Plot Variance across Principal Components for arrays or genes if (mode == "arrays") { # process arrays data plot(d1pca,main="Variance across Principal Components by Arrays") # Group Color based plots mae.colorBasedMatrixPlotting(uniqGroups,group,d1pca$x,xindex=1,yindex=2, gbgcolor="white", mainTitle="Arrays: PC1 Vs PC2", xlabel="PC1",ylabel="PC2", xrange=xrange,yrange=yrange) mae.colorBasedMatrixPlotting(uniqGroups,group,d1pca$x,xindex=2,yindex=3, gbgcolor="white", mainTitle="Arrays: PC2 Vs PC3", xlabel="PC2",ylabel="PC3", xrange=xrange,yrange=yrange) mae.colorBasedMatrixPlotting(uniqGroups,group,d1pca$x,xindex=1,yindex=3, gbgcolor="white", mainTitle="Arrays: PC1 Vs PC3", xlabel="PC1",ylabel="PC3", xrange=xrange,yrange=yrange) } # process arrays data else { # process genes data plot(d1pca,main="Variance across Principal Components by Genes") plot(d1pca$x[,1],d1pca$x[,2],xlim=xrange,ylim=yrange, main="Genes: PC1 Vs PC2",xlab="PC1",ylab="PC2",pch=20) plot(d1pca$x[,2],d1pca$x[,3],xlim=xrange,ylim=yrange, main="Genes: PC2 Vs PC3",xlab="PC2",ylab="PC3",pch=20) plot(d1pca$x[,1],d1pca$x[,3],xlim=xrange,ylim=yrange, main="Genes: PC1 Vs PC3",xlab="PC1",ylab="PC3",pch=20) } # process genes data # [5.2] Plot Sorted loadings # Eigen Vectors(contained in the columns of the loadings) if (mode == "arrays") mainLTitle= "Sorted genes coefficient PC1 loading" else mainLTitle= "Sorted arrays coefficient PC1 loading" plot(L1s$VAL[nrow(L1s):1],xlab="Variable Effect",ylab="Coefficient PC1", main=mainLTitle,type="l") text(L1s$VAL[nrow(L1s):1],labels=paste(L1s$ID),col=4) if (mode == "arrays") mainLTitle= "Sorted genes coefficient PC2 loading" else mainLTitle= "Sorted arrays coefficient PC2 loading" plot(L2s$VAL[nrow(L2s):1],xlab="Variable Effect",ylab="Coefficient PC2", main=mainLTitle,type="l") text(L2s$VAL[nrow(L2s):1],labels=paste(L2s$ID),col=4) if (mode == "arrays") mainLTitle= "Sorted genes coefficient PC3 loading" else mainLTitle= "Sorted arrays coefficient PC3 loading" plot(L3s$VAL[nrow(L1s):1],xlab="Variable Effect",ylab="Coefficient PC3", main=mainLTitle,type="l") text(L3s$VAL[nrow(L3s):1],labels=paste(L3s$ID),col=4) dev.off(); return(pca=d1pca, eigenVals=eigenVals, eigenVec=eigenVec, nEigenVecs=nEigenVecs, points3D=points3D, loadings=loadings) } # end of mae.simplePCAanalysis